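QU8 DWCONV microkernels for SSE4.1/AVX/XOP

Generate QU8 DWCONV kernels with FP32 requantization (up8x9, up8x25,
up16x9 and up16x25, mul32 variants) from
src/qs8-dwconv/unipass-sse-mul32.c.in for SSE4.1, AVX and XOP, and add
them to the Bazel and CMake source lists.

Add the qu8-dwconv-minmax-fp32 unit test to both build systems, and move
generation of the QU8 DWCONV tests from scripts/generate-tests.sh into
scripts/generate-qs8-dwconv.sh.

Regenerated QC8 kernel output (e.g. up16x25 AVX) no longer casts the
input pointer to (const void*) before _mm_loadu_si32.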

PiperOrigin-RevId: 383757553
diff --git a/BUILD.bazel b/BUILD.bazel
index 00177a1..b4fe3ed 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3229,6 +3229,10 @@
     "src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c",
     "src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x24.c",
     "src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x32.c",
+    "src/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c",
+    "src/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c",
+    "src/qu8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c",
+    "src/qu8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c",
     "src/qu8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c",
     "src/qu8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c",
     "src/qu8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
@@ -3526,6 +3530,10 @@
     "src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x16.c",
     "src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x24.c",
     "src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x32.c",
+    "src/qu8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c",
+    "src/qu8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c",
+    "src/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c",
+    "src/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c",
     "src/qu8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c",
     "src/qu8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c",
     "src/qu8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c",
@@ -3646,6 +3654,10 @@
     "src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x16.c",
     "src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x24.c",
     "src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x32.c",
+    "src/qu8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c",
+    "src/qu8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c",
+    "src/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c",
+    "src/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c",
     "src/qu8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c",
     "src/qu8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c",
     "src/qu8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
@@ -7741,6 +7753,16 @@
 )
 
 xnnpack_unit_test(
+    name = "qu8_dwconv_minmax_fp32_test",
+    srcs = [
+        "test/qu8-dwconv-minmax-fp32.cc",
+        "test/dwconv-microkernel-tester.h",
+        "src/xnnpack/AlignedAllocator.h",
+    ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
+    deps = MICROKERNEL_TEST_DEPS + [":packing"],
+)
+
+xnnpack_unit_test(
     name = "qu8_dwconv_minmax_gemmlowp_test",
     srcs = [
         "test/qu8-dwconv-minmax-gemmlowp.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dce5c92..4c1826f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2429,6 +2429,10 @@
   src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c
   src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x24.c
   src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x32.c
+  src/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
+  src/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
+  src/qu8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
+  src/qu8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
   src/qu8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
   src/qu8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
   src/qu8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -2723,6 +2727,10 @@
   src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x16.c
   src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x24.c
   src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x32.c
+  src/qu8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
+  src/qu8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
+  src/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
+  src/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
   src/qu8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
   src/qu8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
   src/qu8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -2842,6 +2850,10 @@
   src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x16.c
   src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x24.c
   src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x32.c
+  src/qu8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
+  src/qu8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
+  src/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
+  src/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
   src/qu8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
   src/qu8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
   src/qu8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -5376,6 +5388,15 @@
   TARGET_LINK_LIBRARIES(qu8-avgpool-minmax-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
   ADD_TEST(qu8-avgpool-minmax-test qu8-avgpool-minmax-test)
 
+  ADD_EXECUTABLE(qu8-dwconv-minmax-fp32-test test/qu8-dwconv-minmax-fp32.cc)
+  SET_TARGET_PROPERTIES(qu8-dwconv-minmax-fp32-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(qu8-dwconv-minmax-fp32-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(qu8-dwconv-minmax-fp32-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+  ADD_TEST(qu8-dwconv-minmax-fp32-test qu8-dwconv-minmax-fp32-test)
+
   ADD_EXECUTABLE(qu8-dwconv-minmax-gemmlowp-test test/qu8-dwconv-minmax-gemmlowp.cc)
   SET_TARGET_PROPERTIES(qu8-dwconv-minmax-gemmlowp-test PROPERTIES
     CXX_STANDARD 11
diff --git a/scripts/generate-qs8-dwconv.sh b/scripts/generate-qs8-dwconv.sh
index 4ce1571..4707be3 100755
--- a/scripts/generate-qs8-dwconv.sh
+++ b/scripts/generate-qs8-dwconv.sh
@@ -206,73 +206,91 @@
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul32.c
 
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c
+
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c
 
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
 
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul32.c
 
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c
+
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c
 
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
 
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul32.c
 
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
+
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
 
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
 
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c
 
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c
+
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c
 
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
 
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-xop-mul32.c
 
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c
+
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c
 
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
 
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c
 
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c
+
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
 tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -o src/qs8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c
 
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
-tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=24 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
+tools/xngen src/qs8-dwconv/unipass-sse-mul32.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
 
 ################################### x86 AVX2 ##################################
 tools/xngen src/qs8-dwconv/unipass-avx2-mul16.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9  -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul16.c
@@ -344,5 +362,8 @@
 
 ################################## Unit tests #################################
 tools/generate-dwconv-test.py --spec test/qs8-dwconv-minmax-gemmlowp.yaml --output test/qs8-dwconv-minmax-gemmlowp.cc
-tools/generate-dwconv-test.py --spec test/qs8-dwconv-minmax-fp32.yaml --output test/qs8-dwconv-minmax-fp32.cc
+tools/generate-dwconv-test.py --spec test/qu8-dwconv-minmax-gemmlowp.yaml --output test/qu8-dwconv-minmax-gemmlowp.cc
+
 tools/generate-dwconv-test.py --spec test/qc8-dwconv-minmax-fp32.yaml --output test/qc8-dwconv-minmax-fp32.cc
+tools/generate-dwconv-test.py --spec test/qs8-dwconv-minmax-fp32.yaml --output test/qs8-dwconv-minmax-fp32.cc
+tools/generate-dwconv-test.py --spec test/qu8-dwconv-minmax-fp32.yaml --output test/qu8-dwconv-minmax-fp32.cc
diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh
index 4b02cc3..4a9d073 100755
--- a/scripts/generate-tests.sh
+++ b/scripts/generate-tests.sh
@@ -4,9 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-### Tests for QU8 micro-kernels
-tools/generate-dwconv-test.py --spec test/qu8-dwconv-minmax-gemmlowp.yaml --output test/qu8-dwconv-minmax-gemmlowp.cc
-
 ### Tests for U8 micro-kernels
 tools/generate-vunary-test.py --spec test/u8-vclamp.yaml --output test/u8-vclamp.cc
 
diff --git a/src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c b/src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
index 0831988..19a6604 100644
--- a/src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
+++ b/src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
@@ -169,11 +169,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -184,11 +184,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -199,11 +199,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -214,11 +214,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -229,11 +229,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -244,11 +244,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -259,11 +259,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -274,11 +274,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -289,11 +289,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -304,11 +304,11 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i9 += 16;
 
@@ -319,11 +319,11 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i10 += 16;
 
@@ -334,11 +334,11 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i11 += 16;
 
@@ -349,11 +349,11 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(int8_t))));
       i12 += 16;
 
@@ -364,11 +364,11 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(int8_t))));
       i13 += 16;
 
@@ -379,11 +379,11 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i14 += 16;
 
@@ -394,11 +394,11 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(int8_t))));
       i15 += 16;
 
@@ -409,11 +409,11 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(int8_t))));
       i16 += 16;
 
@@ -424,11 +424,11 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i17 += 16;
 
@@ -439,11 +439,11 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(int8_t))));
       i18 += 16;
 
@@ -454,11 +454,11 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(int8_t))));
       i19 += 16;
 
@@ -469,11 +469,11 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i20 += 16;
 
@@ -484,11 +484,11 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(int8_t))));
       i21 += 16;
 
@@ -499,11 +499,11 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(int8_t))));
       i22 += 16;
 
@@ -514,11 +514,11 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i23 += 16;
 
@@ -529,11 +529,11 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(int8_t))));
       i24 += 16;
 
@@ -582,151 +582,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 208)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 224)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 256)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 272)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 304)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 320)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 352)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 368)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i24 += 4;
diff --git a/src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c b/src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
index 06adbad..0f65a66 100644
--- a/src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
+++ b/src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
@@ -169,11 +169,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -184,11 +184,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -199,11 +199,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -214,11 +214,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -229,11 +229,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -244,11 +244,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -259,11 +259,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -274,11 +274,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -289,11 +289,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -304,11 +304,11 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i9 += 16;
 
@@ -319,11 +319,11 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i10 += 16;
 
@@ -334,11 +334,11 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i11 += 16;
 
@@ -349,11 +349,11 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(int8_t))));
       i12 += 16;
 
@@ -364,11 +364,11 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(int8_t))));
       i13 += 16;
 
@@ -379,11 +379,11 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i14 += 16;
 
@@ -394,11 +394,11 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(int8_t))));
       i15 += 16;
 
@@ -409,11 +409,11 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(int8_t))));
       i16 += 16;
 
@@ -424,11 +424,11 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i17 += 16;
 
@@ -439,11 +439,11 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(int8_t))));
       i18 += 16;
 
@@ -454,11 +454,11 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(int8_t))));
       i19 += 16;
 
@@ -469,11 +469,11 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i20 += 16;
 
@@ -484,11 +484,11 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(int8_t))));
       i21 += 16;
 
@@ -499,11 +499,11 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(int8_t))));
       i22 += 16;
 
@@ -514,11 +514,11 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i23 += 16;
 
@@ -529,11 +529,11 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(int8_t))));
       i24 += 16;
 
@@ -582,151 +582,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 208)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 224)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 256)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 272)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 304)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 320)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 352)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 368)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i24 += 4;
diff --git a/src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c b/src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
index 198e8cf..7329fa1 100644
--- a/src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
+++ b/src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
@@ -174,11 +174,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -189,11 +189,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -204,11 +204,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -219,11 +219,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -234,11 +234,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -249,11 +249,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -264,11 +264,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -279,11 +279,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -294,11 +294,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -309,11 +309,11 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i9 += 16;
 
@@ -324,11 +324,11 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i10 += 16;
 
@@ -339,11 +339,11 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i11 += 16;
 
@@ -354,11 +354,11 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(int8_t))));
       i12 += 16;
 
@@ -369,11 +369,11 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(int8_t))));
       i13 += 16;
 
@@ -384,11 +384,11 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i14 += 16;
 
@@ -399,11 +399,11 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(int8_t))));
       i15 += 16;
 
@@ -414,11 +414,11 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(int8_t))));
       i16 += 16;
 
@@ -429,11 +429,11 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i17 += 16;
 
@@ -444,11 +444,11 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(int8_t))));
       i18 += 16;
 
@@ -459,11 +459,11 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(int8_t))));
       i19 += 16;
 
@@ -474,11 +474,11 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i20 += 16;
 
@@ -489,11 +489,11 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(int8_t))));
       i21 += 16;
 
@@ -504,11 +504,11 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(int8_t))));
       i22 += 16;
 
@@ -519,11 +519,11 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i23 += 16;
 
@@ -534,11 +534,11 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(int8_t))));
       i24 += 16;
 
@@ -587,151 +587,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
 
         vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i9 += 4;
 
         vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i10 += 4;
 
         vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i11 += 4;
 
         vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i12 += 4;
 
         vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 208)));
         i13 += 4;
 
         vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 224)));
         i14 += 4;
 
         vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i15 += 4;
 
         vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 256)));
         i16 += 4;
 
         vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 272)));
         i17 += 4;
 
         vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i18 += 4;
 
         vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 304)));
         i19 += 4;
 
         vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 320)));
         i20 += 4;
 
         vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i21 += 4;
 
         vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 352)));
         i22 += 4;
 
         vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 368)));
         i23 += 4;
 
         vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i24 += 4;
diff --git a/src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c b/src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
index a925cd2..6993bb3 100644
--- a/src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
+++ b/src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
@@ -89,11 +89,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -104,11 +104,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -119,11 +119,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -134,11 +134,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -149,11 +149,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -164,11 +164,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -179,11 +179,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -194,11 +194,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -209,11 +209,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -262,55 +262,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
diff --git a/src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c b/src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
index dd52855..e90a60e 100644
--- a/src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
+++ b/src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
@@ -89,11 +89,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -104,11 +104,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -119,11 +119,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -134,11 +134,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -149,11 +149,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -164,11 +164,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -179,11 +179,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -194,11 +194,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -209,11 +209,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -262,55 +262,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
diff --git a/src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c b/src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
index 5fd55e7..4cf1015 100644
--- a/src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
+++ b/src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
@@ -94,11 +94,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -109,11 +109,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -124,11 +124,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -139,11 +139,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -154,11 +154,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -169,11 +169,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -184,11 +184,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -199,11 +199,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -214,11 +214,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -267,55 +267,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
diff --git a/src/qc8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c b/src/qc8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c
index 96d8444..d0041b0 100644
--- a/src/qc8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c
+++ b/src/qc8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c
@@ -171,15 +171,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -192,15 +192,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -213,15 +213,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -234,15 +234,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -255,15 +255,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -276,15 +276,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -297,15 +297,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -318,15 +318,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -339,15 +339,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -360,15 +360,15 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 220 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 16)));
+      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 16));
       const __m128i vk9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 20)));
+      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 20));
       const __m128i vk9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i9 += 24;
 
@@ -381,15 +381,15 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 252 * sizeof(int8_t))));
-      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 16)));
+      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 16));
       const __m128i vk10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 20)));
+      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 20));
       const __m128i vk10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 260 * sizeof(int8_t))));
       i10 += 24;
 
@@ -402,15 +402,15 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 268 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 16)));
+      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 16));
       const __m128i vk11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 20)));
+      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 20));
       const __m128i vk11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i11 += 24;
 
@@ -423,15 +423,15 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 300 * sizeof(int8_t))));
-      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 16)));
+      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 16));
       const __m128i vk12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 20)));
+      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 20));
       const __m128i vk12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 308 * sizeof(int8_t))));
       i12 += 24;
 
@@ -444,15 +444,15 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 316 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 16)));
+      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 16));
       const __m128i vk13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 20)));
+      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 20));
       const __m128i vk13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i13 += 24;
 
@@ -465,15 +465,15 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 348 * sizeof(int8_t))));
-      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 16)));
+      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 16));
       const __m128i vk14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 20)));
+      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 20));
       const __m128i vk14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 356 * sizeof(int8_t))));
       i14 += 24;
 
@@ -486,15 +486,15 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 364 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 16)));
+      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 16));
       const __m128i vk15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 20)));
+      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 20));
       const __m128i vk15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i15 += 24;
 
@@ -507,15 +507,15 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 396 * sizeof(int8_t))));
-      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 16)));
+      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 16));
       const __m128i vk16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(int8_t))));
-      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 20)));
+      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 20));
       const __m128i vk16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 404 * sizeof(int8_t))));
       i16 += 24;
 
@@ -528,15 +528,15 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 412 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 420 * sizeof(int8_t))));
-      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 16)));
+      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 16));
       const __m128i vk17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(int8_t))));
-      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 20)));
+      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 20));
       const __m128i vk17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 428 * sizeof(int8_t))));
       i17 += 24;
 
@@ -549,15 +549,15 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 436 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 444 * sizeof(int8_t))));
-      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 16)));
+      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 16));
       const __m128i vk18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(int8_t))));
-      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 20)));
+      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 20));
       const __m128i vk18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 452 * sizeof(int8_t))));
       i18 += 24;
 
@@ -570,15 +570,15 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 460 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 468 * sizeof(int8_t))));
-      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 16)));
+      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 16));
       const __m128i vk19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(int8_t))));
-      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 20)));
+      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 20));
       const __m128i vk19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 476 * sizeof(int8_t))));
       i19 += 24;
 
@@ -591,15 +591,15 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 484 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 492 * sizeof(int8_t))));
-      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 16)));
+      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 16));
       const __m128i vk20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(int8_t))));
-      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 20)));
+      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 20));
       const __m128i vk20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 500 * sizeof(int8_t))));
       i20 += 24;
 
@@ -612,15 +612,15 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 508 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 516 * sizeof(int8_t))));
-      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 16)));
+      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 16));
       const __m128i vk21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(int8_t))));
-      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 20)));
+      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 20));
       const __m128i vk21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 524 * sizeof(int8_t))));
       i21 += 24;
 
@@ -633,15 +633,15 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 532 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 540 * sizeof(int8_t))));
-      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 16)));
+      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 16));
       const __m128i vk22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(int8_t))));
-      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 20)));
+      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 20));
       const __m128i vk22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 548 * sizeof(int8_t))));
       i22 += 24;
 
@@ -654,15 +654,15 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 556 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 564 * sizeof(int8_t))));
-      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 16)));
+      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 16));
       const __m128i vk23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(int8_t))));
-      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 20)));
+      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 20));
       const __m128i vk23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 572 * sizeof(int8_t))));
       i23 += 24;
 
@@ -675,15 +675,15 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 580 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 588 * sizeof(int8_t))));
-      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 16)));
+      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 16));
       const __m128i vk24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(int8_t))));
-      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 20)));
+      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 20));
       const __m128i vk24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 596 * sizeof(int8_t))));
       i24 += 24;
 
@@ -747,151 +747,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 216)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 264)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 312)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 360)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 408)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 432)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 456)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 480)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 504)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 528)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 552)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 576)));
         i24 += 4;
diff --git a/src/qc8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c b/src/qc8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c
index f505a66..40807a7 100644
--- a/src/qc8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c
+++ b/src/qc8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c
@@ -171,15 +171,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -192,15 +192,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -213,15 +213,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -234,15 +234,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -255,15 +255,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -276,15 +276,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -297,15 +297,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -318,15 +318,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -339,15 +339,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -360,15 +360,15 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 220 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 16)));
+      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 16));
       const __m128i vk9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 20)));
+      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 20));
       const __m128i vk9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i9 += 24;
 
@@ -381,15 +381,15 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 252 * sizeof(int8_t))));
-      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 16)));
+      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 16));
       const __m128i vk10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 20)));
+      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 20));
       const __m128i vk10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 260 * sizeof(int8_t))));
       i10 += 24;
 
@@ -402,15 +402,15 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 268 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 16)));
+      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 16));
       const __m128i vk11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 20)));
+      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 20));
       const __m128i vk11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i11 += 24;
 
@@ -423,15 +423,15 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 300 * sizeof(int8_t))));
-      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 16)));
+      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 16));
       const __m128i vk12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 20)));
+      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 20));
       const __m128i vk12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 308 * sizeof(int8_t))));
       i12 += 24;
 
@@ -444,15 +444,15 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 316 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 16)));
+      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 16));
       const __m128i vk13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 20)));
+      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 20));
       const __m128i vk13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i13 += 24;
 
@@ -465,15 +465,15 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 348 * sizeof(int8_t))));
-      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 16)));
+      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 16));
       const __m128i vk14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 20)));
+      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 20));
       const __m128i vk14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 356 * sizeof(int8_t))));
       i14 += 24;
 
@@ -486,15 +486,15 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 364 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 16)));
+      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 16));
       const __m128i vk15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 20)));
+      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 20));
       const __m128i vk15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i15 += 24;
 
@@ -507,15 +507,15 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 396 * sizeof(int8_t))));
-      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 16)));
+      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 16));
       const __m128i vk16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(int8_t))));
-      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 20)));
+      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 20));
       const __m128i vk16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 404 * sizeof(int8_t))));
       i16 += 24;
 
@@ -528,15 +528,15 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 412 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 420 * sizeof(int8_t))));
-      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 16)));
+      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 16));
       const __m128i vk17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(int8_t))));
-      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 20)));
+      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 20));
       const __m128i vk17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 428 * sizeof(int8_t))));
       i17 += 24;
 
@@ -549,15 +549,15 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 436 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 444 * sizeof(int8_t))));
-      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 16)));
+      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 16));
       const __m128i vk18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(int8_t))));
-      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 20)));
+      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 20));
       const __m128i vk18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 452 * sizeof(int8_t))));
       i18 += 24;
 
@@ -570,15 +570,15 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 460 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 468 * sizeof(int8_t))));
-      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 16)));
+      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 16));
       const __m128i vk19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(int8_t))));
-      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 20)));
+      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 20));
       const __m128i vk19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 476 * sizeof(int8_t))));
       i19 += 24;
 
@@ -591,15 +591,15 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 484 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 492 * sizeof(int8_t))));
-      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 16)));
+      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 16));
       const __m128i vk20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(int8_t))));
-      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 20)));
+      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 20));
       const __m128i vk20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 500 * sizeof(int8_t))));
       i20 += 24;
 
@@ -612,15 +612,15 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 508 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 516 * sizeof(int8_t))));
-      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 16)));
+      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 16));
       const __m128i vk21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(int8_t))));
-      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 20)));
+      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 20));
       const __m128i vk21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 524 * sizeof(int8_t))));
       i21 += 24;
 
@@ -633,15 +633,15 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 532 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 540 * sizeof(int8_t))));
-      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 16)));
+      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 16));
       const __m128i vk22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(int8_t))));
-      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 20)));
+      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 20));
       const __m128i vk22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 548 * sizeof(int8_t))));
       i22 += 24;
 
@@ -654,15 +654,15 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 556 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 564 * sizeof(int8_t))));
-      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 16)));
+      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 16));
       const __m128i vk23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(int8_t))));
-      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 20)));
+      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 20));
       const __m128i vk23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 572 * sizeof(int8_t))));
       i23 += 24;
 
@@ -675,15 +675,15 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 580 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 588 * sizeof(int8_t))));
-      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 16)));
+      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 16));
       const __m128i vk24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(int8_t))));
-      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 20)));
+      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 20));
       const __m128i vk24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 596 * sizeof(int8_t))));
       i24 += 24;
 
@@ -747,151 +747,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 216)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 264)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 312)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 360)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 408)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 432)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 456)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 480)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 504)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 528)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 552)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 576)));
         i24 += 4;
diff --git a/src/qc8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c b/src/qc8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c
index 34a245c..51766ae 100644
--- a/src/qc8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c
+++ b/src/qc8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c
@@ -176,15 +176,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -197,15 +197,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -218,15 +218,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -239,15 +239,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -260,15 +260,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -281,15 +281,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -302,15 +302,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -323,15 +323,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -344,15 +344,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -365,15 +365,15 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 220 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 16)));
+      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 16));
       const __m128i vk9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 20)));
+      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 20));
       const __m128i vk9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i9 += 24;
 
@@ -386,15 +386,15 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 252 * sizeof(int8_t))));
-      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 16)));
+      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 16));
       const __m128i vk10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 20)));
+      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 20));
       const __m128i vk10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 260 * sizeof(int8_t))));
       i10 += 24;
 
@@ -407,15 +407,15 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 268 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 16)));
+      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 16));
       const __m128i vk11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 20)));
+      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 20));
       const __m128i vk11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i11 += 24;
 
@@ -428,15 +428,15 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 300 * sizeof(int8_t))));
-      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 16)));
+      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 16));
       const __m128i vk12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 20)));
+      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 20));
       const __m128i vk12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 308 * sizeof(int8_t))));
       i12 += 24;
 
@@ -449,15 +449,15 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 316 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 16)));
+      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 16));
       const __m128i vk13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 20)));
+      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 20));
       const __m128i vk13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i13 += 24;
 
@@ -470,15 +470,15 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 348 * sizeof(int8_t))));
-      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 16)));
+      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 16));
       const __m128i vk14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 20)));
+      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 20));
       const __m128i vk14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 356 * sizeof(int8_t))));
       i14 += 24;
 
@@ -491,15 +491,15 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 364 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 16)));
+      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 16));
       const __m128i vk15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 20)));
+      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 20));
       const __m128i vk15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i15 += 24;
 
@@ -512,15 +512,15 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 396 * sizeof(int8_t))));
-      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 16)));
+      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 16));
       const __m128i vk16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(int8_t))));
-      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 20)));
+      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 20));
       const __m128i vk16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 404 * sizeof(int8_t))));
       i16 += 24;
 
@@ -533,15 +533,15 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 412 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 420 * sizeof(int8_t))));
-      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 16)));
+      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 16));
       const __m128i vk17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(int8_t))));
-      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 20)));
+      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 20));
       const __m128i vk17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 428 * sizeof(int8_t))));
       i17 += 24;
 
@@ -554,15 +554,15 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 436 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 444 * sizeof(int8_t))));
-      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 16)));
+      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 16));
       const __m128i vk18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(int8_t))));
-      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 20)));
+      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 20));
       const __m128i vk18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 452 * sizeof(int8_t))));
       i18 += 24;
 
@@ -575,15 +575,15 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 460 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 468 * sizeof(int8_t))));
-      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 16)));
+      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 16));
       const __m128i vk19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(int8_t))));
-      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 20)));
+      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 20));
       const __m128i vk19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 476 * sizeof(int8_t))));
       i19 += 24;
 
@@ -596,15 +596,15 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 484 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 492 * sizeof(int8_t))));
-      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 16)));
+      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 16));
       const __m128i vk20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(int8_t))));
-      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 20)));
+      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 20));
       const __m128i vk20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 500 * sizeof(int8_t))));
       i20 += 24;
 
@@ -617,15 +617,15 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 508 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 516 * sizeof(int8_t))));
-      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 16)));
+      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 16));
       const __m128i vk21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(int8_t))));
-      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 20)));
+      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 20));
       const __m128i vk21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 524 * sizeof(int8_t))));
       i21 += 24;
 
@@ -638,15 +638,15 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 532 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 540 * sizeof(int8_t))));
-      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 16)));
+      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 16));
       const __m128i vk22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(int8_t))));
-      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 20)));
+      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 20));
       const __m128i vk22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 548 * sizeof(int8_t))));
       i22 += 24;
 
@@ -659,15 +659,15 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 556 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 564 * sizeof(int8_t))));
-      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 16)));
+      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 16));
       const __m128i vk23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(int8_t))));
-      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 20)));
+      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 20));
       const __m128i vk23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 572 * sizeof(int8_t))));
       i23 += 24;
 
@@ -680,15 +680,15 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 580 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 588 * sizeof(int8_t))));
-      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 16)));
+      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 16));
       const __m128i vk24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(int8_t))));
-      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 20)));
+      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 20));
       const __m128i vk24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 596 * sizeof(int8_t))));
       i24 += 24;
 
@@ -752,151 +752,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
 
         vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 216)));
         i9 += 4;
 
         vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i10 += 4;
 
         vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 264)));
         i11 += 4;
 
         vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i12 += 4;
 
         vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 312)));
         i13 += 4;
 
         vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i14 += 4;
 
         vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 360)));
         i15 += 4;
 
         vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i16 += 4;
 
         vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 408)));
         i17 += 4;
 
         vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 432)));
         i18 += 4;
 
         vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 456)));
         i19 += 4;
 
         vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 480)));
         i20 += 4;
 
         vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 504)));
         i21 += 4;
 
         vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 528)));
         i22 += 4;
 
         vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 552)));
         i23 += 4;
 
         vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 576)));
         i24 += 4;
diff --git a/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c b/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
index ede239d..93c0b81 100644
--- a/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
+++ b/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
@@ -91,15 +91,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -112,15 +112,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -133,15 +133,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -154,15 +154,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -175,15 +175,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -196,15 +196,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -217,15 +217,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -238,15 +238,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -259,15 +259,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -331,55 +331,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
diff --git a/src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c b/src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c
index b8c8949..8cd70f3 100644
--- a/src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c
+++ b/src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c
@@ -91,15 +91,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -112,15 +112,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -133,15 +133,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -154,15 +154,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -175,15 +175,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -196,15 +196,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -217,15 +217,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -238,15 +238,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -259,15 +259,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -331,55 +331,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
diff --git a/src/qc8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c b/src/qc8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c
index 51a24df..a5168f7 100644
--- a/src/qc8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c
+++ b/src/qc8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c
@@ -96,15 +96,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -117,15 +117,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -138,15 +138,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -159,15 +159,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -180,15 +180,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -201,15 +201,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -222,15 +222,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -243,15 +243,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -264,15 +264,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -336,55 +336,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
diff --git a/src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c b/src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
index 566f392..bcbe22d 100644
--- a/src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
+++ b/src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
@@ -167,7 +167,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -176,7 +176,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -185,7 +185,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -194,7 +194,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -203,7 +203,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -212,7 +212,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -221,7 +221,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -230,7 +230,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -239,7 +239,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -248,7 +248,7 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i9 += 8;
 
@@ -257,7 +257,7 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(int8_t))));
       i10 += 8;
 
@@ -266,7 +266,7 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i11 += 8;
 
@@ -275,7 +275,7 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(int8_t))));
       i12 += 8;
 
@@ -284,7 +284,7 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i13 += 8;
 
@@ -293,7 +293,7 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i14 += 8;
 
@@ -302,7 +302,7 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i15 += 8;
 
@@ -311,7 +311,7 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(int8_t))));
       i16 += 8;
 
@@ -320,7 +320,7 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i17 += 8;
 
@@ -329,7 +329,7 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(int8_t))));
       i18 += 8;
 
@@ -338,7 +338,7 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i19 += 8;
 
@@ -347,7 +347,7 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i20 += 8;
 
@@ -356,7 +356,7 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i21 += 8;
 
@@ -365,7 +365,7 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(int8_t))));
       i22 += 8;
 
@@ -374,7 +374,7 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i23 += 8;
 
@@ -383,7 +383,7 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(int8_t))));
       i24 += 8;
 
@@ -421,151 +421,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 88)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 104)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 136)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 152)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 184)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i24 += 4;
diff --git a/src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c b/src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
index 2833bf0..bdd4fff 100644
--- a/src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
+++ b/src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
@@ -167,7 +167,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -176,7 +176,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -185,7 +185,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -194,7 +194,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -203,7 +203,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -212,7 +212,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -221,7 +221,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -230,7 +230,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -239,7 +239,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -248,7 +248,7 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i9 += 8;
 
@@ -257,7 +257,7 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(int8_t))));
       i10 += 8;
 
@@ -266,7 +266,7 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i11 += 8;
 
@@ -275,7 +275,7 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(int8_t))));
       i12 += 8;
 
@@ -284,7 +284,7 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i13 += 8;
 
@@ -293,7 +293,7 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i14 += 8;
 
@@ -302,7 +302,7 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i15 += 8;
 
@@ -311,7 +311,7 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(int8_t))));
       i16 += 8;
 
@@ -320,7 +320,7 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i17 += 8;
 
@@ -329,7 +329,7 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(int8_t))));
       i18 += 8;
 
@@ -338,7 +338,7 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i19 += 8;
 
@@ -347,7 +347,7 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i20 += 8;
 
@@ -356,7 +356,7 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i21 += 8;
 
@@ -365,7 +365,7 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(int8_t))));
       i22 += 8;
 
@@ -374,7 +374,7 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i23 += 8;
 
@@ -383,7 +383,7 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(int8_t))));
       i24 += 8;
 
@@ -421,151 +421,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 88)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 104)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 136)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 152)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 184)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i24 += 4;
diff --git a/src/qc8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c b/src/qc8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
index 2a154b8..8363678 100644
--- a/src/qc8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
+++ b/src/qc8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
@@ -172,7 +172,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -181,7 +181,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -190,7 +190,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -199,7 +199,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -208,7 +208,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -217,7 +217,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -226,7 +226,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -235,7 +235,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -244,7 +244,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -253,7 +253,7 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i9 += 8;
 
@@ -262,7 +262,7 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(int8_t))));
       i10 += 8;
 
@@ -271,7 +271,7 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i11 += 8;
 
@@ -280,7 +280,7 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(int8_t))));
       i12 += 8;
 
@@ -289,7 +289,7 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i13 += 8;
 
@@ -298,7 +298,7 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i14 += 8;
 
@@ -307,7 +307,7 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i15 += 8;
 
@@ -316,7 +316,7 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(int8_t))));
       i16 += 8;
 
@@ -325,7 +325,7 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i17 += 8;
 
@@ -334,7 +334,7 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(int8_t))));
       i18 += 8;
 
@@ -343,7 +343,7 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i19 += 8;
 
@@ -352,7 +352,7 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i20 += 8;
 
@@ -361,7 +361,7 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i21 += 8;
 
@@ -370,7 +370,7 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(int8_t))));
       i22 += 8;
 
@@ -379,7 +379,7 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i23 += 8;
 
@@ -388,7 +388,7 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(int8_t))));
       i24 += 8;
 
@@ -426,151 +426,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
 
         vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i9 += 4;
 
         vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i10 += 4;
 
         vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 88)));
         i11 += 4;
 
         vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i12 += 4;
 
         vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 104)));
         i13 += 4;
 
         vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i14 += 4;
 
         vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i15 += 4;
 
         vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i16 += 4;
 
         vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 136)));
         i17 += 4;
 
         vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i18 += 4;
 
         vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 152)));
         i19 += 4;
 
         vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i20 += 4;
 
         vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i21 += 4;
 
         vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i22 += 4;
 
         vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 184)));
         i23 += 4;
 
         vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i24 += 4;
diff --git a/src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c b/src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
index 1ae035f..6594d00 100644
--- a/src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
+++ b/src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
@@ -87,7 +87,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -96,7 +96,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -105,7 +105,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -114,7 +114,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -123,7 +123,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -132,7 +132,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -141,7 +141,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -150,7 +150,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -159,7 +159,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -197,55 +197,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
diff --git a/src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c b/src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
index 24e07eb..62449cd 100644
--- a/src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
+++ b/src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
@@ -87,7 +87,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -96,7 +96,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -105,7 +105,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -114,7 +114,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -123,7 +123,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -132,7 +132,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -141,7 +141,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -150,7 +150,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -159,7 +159,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -197,55 +197,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
diff --git a/src/qc8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c b/src/qc8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
index 66d660b..0d57a38 100644
--- a/src/qc8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
+++ b/src/qc8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
@@ -92,7 +92,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -101,7 +101,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -110,7 +110,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -119,7 +119,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -128,7 +128,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -137,7 +137,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -146,7 +146,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -155,7 +155,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -164,7 +164,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -202,55 +202,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
index 6f6b7c2..9285304 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
@@ -169,11 +169,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -184,11 +184,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -199,11 +199,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -214,11 +214,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -229,11 +229,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -244,11 +244,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -259,11 +259,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -274,11 +274,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -289,11 +289,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -304,11 +304,11 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i9 += 16;
 
@@ -319,11 +319,11 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i10 += 16;
 
@@ -334,11 +334,11 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i11 += 16;
 
@@ -349,11 +349,11 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(int8_t))));
       i12 += 16;
 
@@ -364,11 +364,11 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(int8_t))));
       i13 += 16;
 
@@ -379,11 +379,11 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i14 += 16;
 
@@ -394,11 +394,11 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(int8_t))));
       i15 += 16;
 
@@ -409,11 +409,11 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(int8_t))));
       i16 += 16;
 
@@ -424,11 +424,11 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i17 += 16;
 
@@ -439,11 +439,11 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(int8_t))));
       i18 += 16;
 
@@ -454,11 +454,11 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(int8_t))));
       i19 += 16;
 
@@ -469,11 +469,11 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i20 += 16;
 
@@ -484,11 +484,11 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(int8_t))));
       i21 += 16;
 
@@ -499,11 +499,11 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(int8_t))));
       i22 += 16;
 
@@ -514,11 +514,11 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i23 += 16;
 
@@ -529,11 +529,11 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(int8_t))));
       i24 += 16;
 
@@ -578,151 +578,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 208)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 224)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 256)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 272)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 304)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 320)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 352)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 368)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
index 98652e9..13cdcd5 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
@@ -169,11 +169,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -184,11 +184,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -199,11 +199,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -214,11 +214,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -229,11 +229,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -244,11 +244,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -259,11 +259,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -274,11 +274,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -289,11 +289,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -304,11 +304,11 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i9 += 16;
 
@@ -319,11 +319,11 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i10 += 16;
 
@@ -334,11 +334,11 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i11 += 16;
 
@@ -349,11 +349,11 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(int8_t))));
       i12 += 16;
 
@@ -364,11 +364,11 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(int8_t))));
       i13 += 16;
 
@@ -379,11 +379,11 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i14 += 16;
 
@@ -394,11 +394,11 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(int8_t))));
       i15 += 16;
 
@@ -409,11 +409,11 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(int8_t))));
       i16 += 16;
 
@@ -424,11 +424,11 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i17 += 16;
 
@@ -439,11 +439,11 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(int8_t))));
       i18 += 16;
 
@@ -454,11 +454,11 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(int8_t))));
       i19 += 16;
 
@@ -469,11 +469,11 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i20 += 16;
 
@@ -484,11 +484,11 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(int8_t))));
       i21 += 16;
 
@@ -499,11 +499,11 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(int8_t))));
       i22 += 16;
 
@@ -514,11 +514,11 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i23 += 16;
 
@@ -529,11 +529,11 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(int8_t))));
       i24 += 16;
 
@@ -578,151 +578,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 208)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 224)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 256)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 272)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 304)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 320)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 352)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 368)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
index 3f44573..c2ff8bd 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
@@ -174,11 +174,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -189,11 +189,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -204,11 +204,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -219,11 +219,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -234,11 +234,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -249,11 +249,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -264,11 +264,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -279,11 +279,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -294,11 +294,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -309,11 +309,11 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i9 += 16;
 
@@ -324,11 +324,11 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i10 += 16;
 
@@ -339,11 +339,11 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i11 += 16;
 
@@ -354,11 +354,11 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(int8_t))));
       i12 += 16;
 
@@ -369,11 +369,11 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(int8_t))));
       i13 += 16;
 
@@ -384,11 +384,11 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i14 += 16;
 
@@ -399,11 +399,11 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(int8_t))));
       i15 += 16;
 
@@ -414,11 +414,11 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(int8_t))));
       i16 += 16;
 
@@ -429,11 +429,11 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i17 += 16;
 
@@ -444,11 +444,11 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(int8_t))));
       i18 += 16;
 
@@ -459,11 +459,11 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(int8_t))));
       i19 += 16;
 
@@ -474,11 +474,11 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i20 += 16;
 
@@ -489,11 +489,11 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(int8_t))));
       i21 += 16;
 
@@ -504,11 +504,11 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(int8_t))));
       i22 += 16;
 
@@ -519,11 +519,11 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i23 += 16;
 
@@ -534,11 +534,11 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(int8_t))));
       i24 += 16;
 
@@ -583,151 +583,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
 
         vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i9 += 4;
 
         vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i10 += 4;
 
         vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i11 += 4;
 
         vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i12 += 4;
 
         vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 208)));
         i13 += 4;
 
         vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 224)));
         i14 += 4;
 
         vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i15 += 4;
 
         vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 256)));
         i16 += 4;
 
         vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 272)));
         i17 += 4;
 
         vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i18 += 4;
 
         vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 304)));
         i19 += 4;
 
         vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 320)));
         i20 += 4;
 
         vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i21 += 4;
 
         vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 352)));
         i22 += 4;
 
         vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 368)));
         i23 += 4;
 
         vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c
index dbe7341..e81b614 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c
@@ -169,11 +169,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -184,11 +184,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -199,11 +199,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -214,11 +214,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -229,11 +229,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -244,11 +244,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -259,11 +259,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -274,11 +274,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -289,11 +289,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -304,11 +304,11 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i9 += 16;
 
@@ -319,11 +319,11 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i10 += 16;
 
@@ -334,11 +334,11 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i11 += 16;
 
@@ -349,11 +349,11 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(int8_t))));
       i12 += 16;
 
@@ -364,11 +364,11 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(int8_t))));
       i13 += 16;
 
@@ -379,11 +379,11 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i14 += 16;
 
@@ -394,11 +394,11 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(int8_t))));
       i15 += 16;
 
@@ -409,11 +409,11 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(int8_t))));
       i16 += 16;
 
@@ -424,11 +424,11 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i17 += 16;
 
@@ -439,11 +439,11 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(int8_t))));
       i18 += 16;
 
@@ -454,11 +454,11 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(int8_t))));
       i19 += 16;
 
@@ -469,11 +469,11 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i20 += 16;
 
@@ -484,11 +484,11 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(int8_t))));
       i21 += 16;
 
@@ -499,11 +499,11 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(int8_t))));
       i22 += 16;
 
@@ -514,11 +514,11 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i23 += 16;
 
@@ -529,11 +529,11 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(int8_t))));
       i24 += 16;
 
@@ -613,151 +613,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 208)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 224)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 256)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 272)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 304)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 320)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 352)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 368)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c
index 1ea33c3..e9e8910 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c
@@ -169,11 +169,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -184,11 +184,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -199,11 +199,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -214,11 +214,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -229,11 +229,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -244,11 +244,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -259,11 +259,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -274,11 +274,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -289,11 +289,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -304,11 +304,11 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i9 += 16;
 
@@ -319,11 +319,11 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i10 += 16;
 
@@ -334,11 +334,11 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i11 += 16;
 
@@ -349,11 +349,11 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(int8_t))));
       i12 += 16;
 
@@ -364,11 +364,11 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(int8_t))));
       i13 += 16;
 
@@ -379,11 +379,11 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i14 += 16;
 
@@ -394,11 +394,11 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(int8_t))));
       i15 += 16;
 
@@ -409,11 +409,11 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(int8_t))));
       i16 += 16;
 
@@ -424,11 +424,11 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i17 += 16;
 
@@ -439,11 +439,11 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(int8_t))));
       i18 += 16;
 
@@ -454,11 +454,11 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(int8_t))));
       i19 += 16;
 
@@ -469,11 +469,11 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i20 += 16;
 
@@ -484,11 +484,11 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(int8_t))));
       i21 += 16;
 
@@ -499,11 +499,11 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(int8_t))));
       i22 += 16;
 
@@ -514,11 +514,11 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i23 += 16;
 
@@ -529,11 +529,11 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(int8_t))));
       i24 += 16;
 
@@ -613,151 +613,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 208)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 224)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 256)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 272)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 304)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 320)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 352)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 368)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c
index 5093cf0..4f3bb8b 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c
@@ -174,11 +174,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -189,11 +189,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -204,11 +204,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -219,11 +219,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -234,11 +234,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -249,11 +249,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -264,11 +264,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -279,11 +279,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -294,11 +294,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -309,11 +309,11 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i9 += 16;
 
@@ -324,11 +324,11 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i10 += 16;
 
@@ -339,11 +339,11 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i11 += 16;
 
@@ -354,11 +354,11 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(int8_t))));
       i12 += 16;
 
@@ -369,11 +369,11 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(int8_t))));
       i13 += 16;
 
@@ -384,11 +384,11 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i14 += 16;
 
@@ -399,11 +399,11 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(int8_t))));
       i15 += 16;
 
@@ -414,11 +414,11 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(int8_t))));
       i16 += 16;
 
@@ -429,11 +429,11 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i17 += 16;
 
@@ -444,11 +444,11 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(int8_t))));
       i18 += 16;
 
@@ -459,11 +459,11 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(int8_t))));
       i19 += 16;
 
@@ -474,11 +474,11 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i20 += 16;
 
@@ -489,11 +489,11 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(int8_t))));
       i21 += 16;
 
@@ -504,11 +504,11 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(int8_t))));
       i22 += 16;
 
@@ -519,11 +519,11 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i23 += 16;
 
@@ -534,11 +534,11 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(int8_t))));
       i24 += 16;
 
@@ -618,151 +618,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
 
         vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i9 += 4;
 
         vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i10 += 4;
 
         vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i11 += 4;
 
         vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i12 += 4;
 
         vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 208)));
         i13 += 4;
 
         vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 224)));
         i14 += 4;
 
         vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i15 += 4;
 
         vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 256)));
         i16 += 4;
 
         vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 272)));
         i17 += 4;
 
         vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i18 += 4;
 
         vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 304)));
         i19 += 4;
 
         vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 320)));
         i20 += 4;
 
         vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i21 += 4;
 
         vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 352)));
         i22 += 4;
 
         vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 368)));
         i23 += 4;
 
         vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
index d1592b6..5b2cca7 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
@@ -89,11 +89,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -104,11 +104,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -119,11 +119,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -134,11 +134,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -149,11 +149,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -164,11 +164,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -179,11 +179,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -194,11 +194,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -209,11 +209,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -258,55 +258,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
index ef07590..2dc639e 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
@@ -89,11 +89,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -104,11 +104,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -119,11 +119,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -134,11 +134,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -149,11 +149,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -164,11 +164,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -179,11 +179,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -194,11 +194,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -209,11 +209,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -258,55 +258,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
index 35ff8f6..156d30c 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
@@ -94,11 +94,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -109,11 +109,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -124,11 +124,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -139,11 +139,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -154,11 +154,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -169,11 +169,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -184,11 +184,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -199,11 +199,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -214,11 +214,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -263,55 +263,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c
index 34a8a20..741f6b1 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c
@@ -89,11 +89,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -104,11 +104,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -119,11 +119,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -134,11 +134,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -149,11 +149,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -164,11 +164,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -179,11 +179,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -194,11 +194,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -209,11 +209,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -293,55 +293,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c
index 68533a2..2acd4fe 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c
@@ -89,11 +89,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -104,11 +104,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -119,11 +119,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -134,11 +134,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -149,11 +149,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -164,11 +164,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -179,11 +179,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -194,11 +194,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -209,11 +209,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -293,55 +293,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c
index d3fe2ae..bc17656 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c
@@ -94,11 +94,11 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i0 += 16;
 
@@ -109,11 +109,11 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i1 += 16;
 
@@ -124,11 +124,11 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i2 += 16;
 
@@ -139,11 +139,11 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i3 += 16;
 
@@ -154,11 +154,11 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i4 += 16;
 
@@ -169,11 +169,11 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i5 += 16;
 
@@ -184,11 +184,11 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i6 += 16;
 
@@ -199,11 +199,11 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i7 += 16;
 
@@ -214,11 +214,11 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i8 += 16;
 
@@ -298,55 +298,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c
index fa27362..c10d091 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c
@@ -171,15 +171,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -192,15 +192,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -213,15 +213,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -234,15 +234,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -255,15 +255,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -276,15 +276,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -297,15 +297,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -318,15 +318,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -339,15 +339,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -360,15 +360,15 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 220 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 16)));
+      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 16));
       const __m128i vk9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 20)));
+      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 20));
       const __m128i vk9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i9 += 24;
 
@@ -381,15 +381,15 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 252 * sizeof(int8_t))));
-      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 16)));
+      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 16));
       const __m128i vk10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 20)));
+      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 20));
       const __m128i vk10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 260 * sizeof(int8_t))));
       i10 += 24;
 
@@ -402,15 +402,15 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 268 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 16)));
+      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 16));
       const __m128i vk11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 20)));
+      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 20));
       const __m128i vk11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i11 += 24;
 
@@ -423,15 +423,15 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 300 * sizeof(int8_t))));
-      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 16)));
+      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 16));
       const __m128i vk12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 20)));
+      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 20));
       const __m128i vk12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 308 * sizeof(int8_t))));
       i12 += 24;
 
@@ -444,15 +444,15 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 316 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 16)));
+      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 16));
       const __m128i vk13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 20)));
+      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 20));
       const __m128i vk13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i13 += 24;
 
@@ -465,15 +465,15 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 348 * sizeof(int8_t))));
-      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 16)));
+      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 16));
       const __m128i vk14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 20)));
+      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 20));
       const __m128i vk14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 356 * sizeof(int8_t))));
       i14 += 24;
 
@@ -486,15 +486,15 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 364 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 16)));
+      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 16));
       const __m128i vk15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 20)));
+      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 20));
       const __m128i vk15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i15 += 24;
 
@@ -507,15 +507,15 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 396 * sizeof(int8_t))));
-      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 16)));
+      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 16));
       const __m128i vk16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(int8_t))));
-      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 20)));
+      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 20));
       const __m128i vk16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 404 * sizeof(int8_t))));
       i16 += 24;
 
@@ -528,15 +528,15 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 412 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 420 * sizeof(int8_t))));
-      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 16)));
+      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 16));
       const __m128i vk17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(int8_t))));
-      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 20)));
+      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 20));
       const __m128i vk17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 428 * sizeof(int8_t))));
       i17 += 24;
 
@@ -549,15 +549,15 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 436 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 444 * sizeof(int8_t))));
-      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 16)));
+      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 16));
       const __m128i vk18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(int8_t))));
-      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 20)));
+      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 20));
       const __m128i vk18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 452 * sizeof(int8_t))));
       i18 += 24;
 
@@ -570,15 +570,15 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 460 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 468 * sizeof(int8_t))));
-      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 16)));
+      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 16));
       const __m128i vk19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(int8_t))));
-      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 20)));
+      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 20));
       const __m128i vk19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 476 * sizeof(int8_t))));
       i19 += 24;
 
@@ -591,15 +591,15 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 484 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 492 * sizeof(int8_t))));
-      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 16)));
+      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 16));
       const __m128i vk20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(int8_t))));
-      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 20)));
+      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 20));
       const __m128i vk20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 500 * sizeof(int8_t))));
       i20 += 24;
 
@@ -612,15 +612,15 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 508 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 516 * sizeof(int8_t))));
-      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 16)));
+      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 16));
       const __m128i vk21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(int8_t))));
-      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 20)));
+      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 20));
       const __m128i vk21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 524 * sizeof(int8_t))));
       i21 += 24;
 
@@ -633,15 +633,15 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 532 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 540 * sizeof(int8_t))));
-      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 16)));
+      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 16));
       const __m128i vk22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(int8_t))));
-      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 20)));
+      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 20));
       const __m128i vk22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 548 * sizeof(int8_t))));
       i22 += 24;
 
@@ -654,15 +654,15 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 556 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 564 * sizeof(int8_t))));
-      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 16)));
+      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 16));
       const __m128i vk23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(int8_t))));
-      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 20)));
+      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 20));
       const __m128i vk23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 572 * sizeof(int8_t))));
       i23 += 24;
 
@@ -675,15 +675,15 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 580 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 588 * sizeof(int8_t))));
-      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 16)));
+      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 16));
       const __m128i vk24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(int8_t))));
-      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 20)));
+      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 20));
       const __m128i vk24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 596 * sizeof(int8_t))));
       i24 += 24;
 
@@ -741,151 +741,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 216)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 264)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 312)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 360)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 408)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 432)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 456)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 480)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 504)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 528)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 552)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 576)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c
index 105c54e..daddfe3 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c
@@ -171,15 +171,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -192,15 +192,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -213,15 +213,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -234,15 +234,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -255,15 +255,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -276,15 +276,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -297,15 +297,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -318,15 +318,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -339,15 +339,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -360,15 +360,15 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 220 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 16)));
+      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 16));
       const __m128i vk9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 20)));
+      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 20));
       const __m128i vk9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i9 += 24;
 
@@ -381,15 +381,15 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 252 * sizeof(int8_t))));
-      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 16)));
+      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 16));
       const __m128i vk10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 20)));
+      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 20));
       const __m128i vk10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 260 * sizeof(int8_t))));
       i10 += 24;
 
@@ -402,15 +402,15 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 268 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 16)));
+      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 16));
       const __m128i vk11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 20)));
+      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 20));
       const __m128i vk11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i11 += 24;
 
@@ -423,15 +423,15 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 300 * sizeof(int8_t))));
-      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 16)));
+      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 16));
       const __m128i vk12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 20)));
+      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 20));
       const __m128i vk12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 308 * sizeof(int8_t))));
       i12 += 24;
 
@@ -444,15 +444,15 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 316 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 16)));
+      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 16));
       const __m128i vk13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 20)));
+      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 20));
       const __m128i vk13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i13 += 24;
 
@@ -465,15 +465,15 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 348 * sizeof(int8_t))));
-      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 16)));
+      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 16));
       const __m128i vk14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 20)));
+      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 20));
       const __m128i vk14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 356 * sizeof(int8_t))));
       i14 += 24;
 
@@ -486,15 +486,15 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 364 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 16)));
+      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 16));
       const __m128i vk15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 20)));
+      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 20));
       const __m128i vk15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i15 += 24;
 
@@ -507,15 +507,15 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 396 * sizeof(int8_t))));
-      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 16)));
+      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 16));
       const __m128i vk16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(int8_t))));
-      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 20)));
+      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 20));
       const __m128i vk16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 404 * sizeof(int8_t))));
       i16 += 24;
 
@@ -528,15 +528,15 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 412 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 420 * sizeof(int8_t))));
-      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 16)));
+      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 16));
       const __m128i vk17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(int8_t))));
-      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 20)));
+      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 20));
       const __m128i vk17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 428 * sizeof(int8_t))));
       i17 += 24;
 
@@ -549,15 +549,15 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 436 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 444 * sizeof(int8_t))));
-      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 16)));
+      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 16));
       const __m128i vk18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(int8_t))));
-      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 20)));
+      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 20));
       const __m128i vk18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 452 * sizeof(int8_t))));
       i18 += 24;
 
@@ -570,15 +570,15 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 460 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 468 * sizeof(int8_t))));
-      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 16)));
+      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 16));
       const __m128i vk19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(int8_t))));
-      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 20)));
+      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 20));
       const __m128i vk19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 476 * sizeof(int8_t))));
       i19 += 24;
 
@@ -591,15 +591,15 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 484 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 492 * sizeof(int8_t))));
-      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 16)));
+      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 16));
       const __m128i vk20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(int8_t))));
-      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 20)));
+      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 20));
       const __m128i vk20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 500 * sizeof(int8_t))));
       i20 += 24;
 
@@ -612,15 +612,15 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 508 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 516 * sizeof(int8_t))));
-      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 16)));
+      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 16));
       const __m128i vk21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(int8_t))));
-      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 20)));
+      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 20));
       const __m128i vk21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 524 * sizeof(int8_t))));
       i21 += 24;
 
@@ -633,15 +633,15 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 532 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 540 * sizeof(int8_t))));
-      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 16)));
+      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 16));
       const __m128i vk22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(int8_t))));
-      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 20)));
+      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 20));
       const __m128i vk22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 548 * sizeof(int8_t))));
       i22 += 24;
 
@@ -654,15 +654,15 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 556 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 564 * sizeof(int8_t))));
-      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 16)));
+      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 16));
       const __m128i vk23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(int8_t))));
-      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 20)));
+      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 20));
       const __m128i vk23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 572 * sizeof(int8_t))));
       i23 += 24;
 
@@ -675,15 +675,15 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 580 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 588 * sizeof(int8_t))));
-      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 16)));
+      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 16));
       const __m128i vk24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(int8_t))));
-      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 20)));
+      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 20));
       const __m128i vk24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 596 * sizeof(int8_t))));
       i24 += 24;
 
@@ -741,151 +741,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 216)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 264)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 312)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 360)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 408)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 432)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 456)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 480)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 504)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 528)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 552)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 576)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c
index 476e4e0..a1ba12b 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c
@@ -176,15 +176,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -197,15 +197,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -218,15 +218,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -239,15 +239,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -260,15 +260,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -281,15 +281,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -302,15 +302,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -323,15 +323,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -344,15 +344,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -365,15 +365,15 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 220 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 16)));
+      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 16));
       const __m128i vk9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 20)));
+      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 20));
       const __m128i vk9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i9 += 24;
 
@@ -386,15 +386,15 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 252 * sizeof(int8_t))));
-      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 16)));
+      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 16));
       const __m128i vk10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 20)));
+      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 20));
       const __m128i vk10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 260 * sizeof(int8_t))));
       i10 += 24;
 
@@ -407,15 +407,15 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 268 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 16)));
+      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 16));
       const __m128i vk11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 20)));
+      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 20));
       const __m128i vk11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i11 += 24;
 
@@ -428,15 +428,15 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 300 * sizeof(int8_t))));
-      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 16)));
+      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 16));
       const __m128i vk12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 20)));
+      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 20));
       const __m128i vk12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 308 * sizeof(int8_t))));
       i12 += 24;
 
@@ -449,15 +449,15 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 316 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 16)));
+      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 16));
       const __m128i vk13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 20)));
+      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 20));
       const __m128i vk13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i13 += 24;
 
@@ -470,15 +470,15 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 348 * sizeof(int8_t))));
-      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 16)));
+      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 16));
       const __m128i vk14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 20)));
+      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 20));
       const __m128i vk14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 356 * sizeof(int8_t))));
       i14 += 24;
 
@@ -491,15 +491,15 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 364 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 16)));
+      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 16));
       const __m128i vk15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 20)));
+      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 20));
       const __m128i vk15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i15 += 24;
 
@@ -512,15 +512,15 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 396 * sizeof(int8_t))));
-      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 16)));
+      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 16));
       const __m128i vk16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(int8_t))));
-      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 20)));
+      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 20));
       const __m128i vk16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 404 * sizeof(int8_t))));
       i16 += 24;
 
@@ -533,15 +533,15 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 412 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 420 * sizeof(int8_t))));
-      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 16)));
+      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 16));
       const __m128i vk17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(int8_t))));
-      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 20)));
+      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 20));
       const __m128i vk17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 428 * sizeof(int8_t))));
       i17 += 24;
 
@@ -554,15 +554,15 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 436 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 444 * sizeof(int8_t))));
-      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 16)));
+      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 16));
       const __m128i vk18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(int8_t))));
-      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 20)));
+      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 20));
       const __m128i vk18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 452 * sizeof(int8_t))));
       i18 += 24;
 
@@ -575,15 +575,15 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 460 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 468 * sizeof(int8_t))));
-      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 16)));
+      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 16));
       const __m128i vk19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(int8_t))));
-      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 20)));
+      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 20));
       const __m128i vk19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 476 * sizeof(int8_t))));
       i19 += 24;
 
@@ -596,15 +596,15 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 484 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 492 * sizeof(int8_t))));
-      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 16)));
+      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 16));
       const __m128i vk20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(int8_t))));
-      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 20)));
+      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 20));
       const __m128i vk20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 500 * sizeof(int8_t))));
       i20 += 24;
 
@@ -617,15 +617,15 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 508 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 516 * sizeof(int8_t))));
-      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 16)));
+      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 16));
       const __m128i vk21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(int8_t))));
-      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 20)));
+      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 20));
       const __m128i vk21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 524 * sizeof(int8_t))));
       i21 += 24;
 
@@ -638,15 +638,15 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 532 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 540 * sizeof(int8_t))));
-      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 16)));
+      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 16));
       const __m128i vk22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(int8_t))));
-      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 20)));
+      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 20));
       const __m128i vk22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 548 * sizeof(int8_t))));
       i22 += 24;
 
@@ -659,15 +659,15 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 556 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 564 * sizeof(int8_t))));
-      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 16)));
+      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 16));
       const __m128i vk23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(int8_t))));
-      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 20)));
+      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 20));
       const __m128i vk23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 572 * sizeof(int8_t))));
       i23 += 24;
 
@@ -680,15 +680,15 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 580 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 588 * sizeof(int8_t))));
-      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 16)));
+      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 16));
       const __m128i vk24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(int8_t))));
-      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 20)));
+      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 20));
       const __m128i vk24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 596 * sizeof(int8_t))));
       i24 += 24;
 
@@ -746,151 +746,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
 
         vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 216)));
         i9 += 4;
 
         vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i10 += 4;
 
         vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 264)));
         i11 += 4;
 
         vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i12 += 4;
 
         vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 312)));
         i13 += 4;
 
         vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i14 += 4;
 
         vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 360)));
         i15 += 4;
 
         vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i16 += 4;
 
         vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 408)));
         i17 += 4;
 
         vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 432)));
         i18 += 4;
 
         vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 456)));
         i19 += 4;
 
         vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 480)));
         i20 += 4;
 
         vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 504)));
         i21 += 4;
 
         vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 528)));
         i22 += 4;
 
         vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 552)));
         i23 += 4;
 
         vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 576)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c
index 4023ab2..caa1556 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c
@@ -171,15 +171,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -192,15 +192,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -213,15 +213,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -234,15 +234,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -255,15 +255,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -276,15 +276,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -297,15 +297,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -318,15 +318,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -339,15 +339,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -360,15 +360,15 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 220 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 16)));
+      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 16));
       const __m128i vk9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 20)));
+      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 20));
       const __m128i vk9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i9 += 24;
 
@@ -381,15 +381,15 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 252 * sizeof(int8_t))));
-      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 16)));
+      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 16));
       const __m128i vk10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 20)));
+      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 20));
       const __m128i vk10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 260 * sizeof(int8_t))));
       i10 += 24;
 
@@ -402,15 +402,15 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 268 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 16)));
+      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 16));
       const __m128i vk11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 20)));
+      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 20));
       const __m128i vk11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i11 += 24;
 
@@ -423,15 +423,15 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 300 * sizeof(int8_t))));
-      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 16)));
+      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 16));
       const __m128i vk12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 20)));
+      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 20));
       const __m128i vk12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 308 * sizeof(int8_t))));
       i12 += 24;
 
@@ -444,15 +444,15 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 316 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 16)));
+      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 16));
       const __m128i vk13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 20)));
+      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 20));
       const __m128i vk13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i13 += 24;
 
@@ -465,15 +465,15 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 348 * sizeof(int8_t))));
-      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 16)));
+      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 16));
       const __m128i vk14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 20)));
+      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 20));
       const __m128i vk14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 356 * sizeof(int8_t))));
       i14 += 24;
 
@@ -486,15 +486,15 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 364 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 16)));
+      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 16));
       const __m128i vk15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 20)));
+      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 20));
       const __m128i vk15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i15 += 24;
 
@@ -507,15 +507,15 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 396 * sizeof(int8_t))));
-      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 16)));
+      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 16));
       const __m128i vk16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(int8_t))));
-      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 20)));
+      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 20));
       const __m128i vk16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 404 * sizeof(int8_t))));
       i16 += 24;
 
@@ -528,15 +528,15 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 412 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 420 * sizeof(int8_t))));
-      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 16)));
+      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 16));
       const __m128i vk17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(int8_t))));
-      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 20)));
+      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 20));
       const __m128i vk17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 428 * sizeof(int8_t))));
       i17 += 24;
 
@@ -549,15 +549,15 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 436 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 444 * sizeof(int8_t))));
-      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 16)));
+      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 16));
       const __m128i vk18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(int8_t))));
-      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 20)));
+      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 20));
       const __m128i vk18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 452 * sizeof(int8_t))));
       i18 += 24;
 
@@ -570,15 +570,15 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 460 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 468 * sizeof(int8_t))));
-      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 16)));
+      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 16));
       const __m128i vk19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(int8_t))));
-      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 20)));
+      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 20));
       const __m128i vk19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 476 * sizeof(int8_t))));
       i19 += 24;
 
@@ -591,15 +591,15 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 484 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 492 * sizeof(int8_t))));
-      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 16)));
+      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 16));
       const __m128i vk20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(int8_t))));
-      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 20)));
+      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 20));
       const __m128i vk20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 500 * sizeof(int8_t))));
       i20 += 24;
 
@@ -612,15 +612,15 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 508 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 516 * sizeof(int8_t))));
-      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 16)));
+      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 16));
       const __m128i vk21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(int8_t))));
-      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 20)));
+      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 20));
       const __m128i vk21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 524 * sizeof(int8_t))));
       i21 += 24;
 
@@ -633,15 +633,15 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 532 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 540 * sizeof(int8_t))));
-      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 16)));
+      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 16));
       const __m128i vk22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(int8_t))));
-      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 20)));
+      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 20));
       const __m128i vk22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 548 * sizeof(int8_t))));
       i22 += 24;
 
@@ -654,15 +654,15 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 556 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 564 * sizeof(int8_t))));
-      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 16)));
+      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 16));
       const __m128i vk23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(int8_t))));
-      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 20)));
+      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 20));
       const __m128i vk23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 572 * sizeof(int8_t))));
       i23 += 24;
 
@@ -675,15 +675,15 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 580 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 588 * sizeof(int8_t))));
-      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 16)));
+      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 16));
       const __m128i vk24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(int8_t))));
-      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 20)));
+      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 20));
       const __m128i vk24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 596 * sizeof(int8_t))));
       i24 += 24;
 
@@ -790,151 +790,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 216)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 264)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 312)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 360)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 408)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 432)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 456)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 480)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 504)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 528)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 552)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 576)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul32.c
index acbdb13..6314e81 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul32.c
@@ -171,15 +171,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -192,15 +192,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -213,15 +213,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -234,15 +234,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -255,15 +255,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -276,15 +276,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -297,15 +297,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -318,15 +318,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -339,15 +339,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -360,15 +360,15 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 220 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 16)));
+      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 16));
       const __m128i vk9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 20)));
+      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 20));
       const __m128i vk9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i9 += 24;
 
@@ -381,15 +381,15 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 252 * sizeof(int8_t))));
-      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 16)));
+      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 16));
       const __m128i vk10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 20)));
+      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 20));
       const __m128i vk10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 260 * sizeof(int8_t))));
       i10 += 24;
 
@@ -402,15 +402,15 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 268 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 16)));
+      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 16));
       const __m128i vk11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 20)));
+      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 20));
       const __m128i vk11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i11 += 24;
 
@@ -423,15 +423,15 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 300 * sizeof(int8_t))));
-      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 16)));
+      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 16));
       const __m128i vk12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 20)));
+      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 20));
       const __m128i vk12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 308 * sizeof(int8_t))));
       i12 += 24;
 
@@ -444,15 +444,15 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 316 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 16)));
+      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 16));
       const __m128i vk13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 20)));
+      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 20));
       const __m128i vk13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i13 += 24;
 
@@ -465,15 +465,15 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 348 * sizeof(int8_t))));
-      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 16)));
+      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 16));
       const __m128i vk14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 20)));
+      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 20));
       const __m128i vk14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 356 * sizeof(int8_t))));
       i14 += 24;
 
@@ -486,15 +486,15 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 364 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 16)));
+      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 16));
       const __m128i vk15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 20)));
+      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 20));
       const __m128i vk15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i15 += 24;
 
@@ -507,15 +507,15 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 396 * sizeof(int8_t))));
-      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 16)));
+      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 16));
       const __m128i vk16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(int8_t))));
-      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 20)));
+      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 20));
       const __m128i vk16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 404 * sizeof(int8_t))));
       i16 += 24;
 
@@ -528,15 +528,15 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 412 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 420 * sizeof(int8_t))));
-      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 16)));
+      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 16));
       const __m128i vk17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(int8_t))));
-      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 20)));
+      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 20));
       const __m128i vk17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 428 * sizeof(int8_t))));
       i17 += 24;
 
@@ -549,15 +549,15 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 436 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 444 * sizeof(int8_t))));
-      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 16)));
+      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 16));
       const __m128i vk18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(int8_t))));
-      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 20)));
+      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 20));
       const __m128i vk18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 452 * sizeof(int8_t))));
       i18 += 24;
 
@@ -570,15 +570,15 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 460 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 468 * sizeof(int8_t))));
-      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 16)));
+      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 16));
       const __m128i vk19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(int8_t))));
-      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 20)));
+      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 20));
       const __m128i vk19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 476 * sizeof(int8_t))));
       i19 += 24;
 
@@ -591,15 +591,15 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 484 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 492 * sizeof(int8_t))));
-      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 16)));
+      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 16));
       const __m128i vk20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(int8_t))));
-      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 20)));
+      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 20));
       const __m128i vk20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 500 * sizeof(int8_t))));
       i20 += 24;
 
@@ -612,15 +612,15 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 508 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 516 * sizeof(int8_t))));
-      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 16)));
+      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 16));
       const __m128i vk21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(int8_t))));
-      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 20)));
+      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 20));
       const __m128i vk21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 524 * sizeof(int8_t))));
       i21 += 24;
 
@@ -633,15 +633,15 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 532 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 540 * sizeof(int8_t))));
-      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 16)));
+      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 16));
       const __m128i vk22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(int8_t))));
-      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 20)));
+      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 20));
       const __m128i vk22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 548 * sizeof(int8_t))));
       i22 += 24;
 
@@ -654,15 +654,15 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 556 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 564 * sizeof(int8_t))));
-      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 16)));
+      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 16));
       const __m128i vk23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(int8_t))));
-      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 20)));
+      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 20));
       const __m128i vk23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 572 * sizeof(int8_t))));
       i23 += 24;
 
@@ -675,15 +675,15 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 580 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 588 * sizeof(int8_t))));
-      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 16)));
+      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 16));
       const __m128i vk24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(int8_t))));
-      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 20)));
+      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 20));
       const __m128i vk24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 596 * sizeof(int8_t))));
       i24 += 24;
 
@@ -790,151 +790,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 216)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 264)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 312)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 360)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 408)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 432)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 456)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 480)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 504)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 528)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 552)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 576)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c
index 16c1aec..786e412 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c
@@ -176,15 +176,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -197,15 +197,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -218,15 +218,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -239,15 +239,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -260,15 +260,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -281,15 +281,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -302,15 +302,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -323,15 +323,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -344,15 +344,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -365,15 +365,15 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 220 * sizeof(int8_t))));
-      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 8)));
+      const __m128i vi9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 8));
       const __m128i vk9x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(int8_t))));
-      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 12)));
+      const __m128i vi9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 12));
       const __m128i vk9xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 228 * sizeof(int8_t))));
-      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 16)));
+      const __m128i vi9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 16));
       const __m128i vk9xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(int8_t))));
-      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 20)));
+      const __m128i vi9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 20));
       const __m128i vk9xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 236 * sizeof(int8_t))));
       i9 += 24;
 
@@ -386,15 +386,15 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 244 * sizeof(int8_t))));
-      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 8)));
+      const __m128i vi10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 8));
       const __m128i vk10x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(int8_t))));
-      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 12)));
+      const __m128i vi10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 12));
       const __m128i vk10xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 252 * sizeof(int8_t))));
-      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 16)));
+      const __m128i vi10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 16));
       const __m128i vk10xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(int8_t))));
-      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 20)));
+      const __m128i vi10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 20));
       const __m128i vk10xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 260 * sizeof(int8_t))));
       i10 += 24;
 
@@ -407,15 +407,15 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 268 * sizeof(int8_t))));
-      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 8)));
+      const __m128i vi11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 8));
       const __m128i vk11x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(int8_t))));
-      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 12)));
+      const __m128i vi11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 12));
       const __m128i vk11xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 276 * sizeof(int8_t))));
-      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 16)));
+      const __m128i vi11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 16));
       const __m128i vk11xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(int8_t))));
-      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 20)));
+      const __m128i vi11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 20));
       const __m128i vk11xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 284 * sizeof(int8_t))));
       i11 += 24;
 
@@ -428,15 +428,15 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 292 * sizeof(int8_t))));
-      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 8)));
+      const __m128i vi12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 8));
       const __m128i vk12x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(int8_t))));
-      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 12)));
+      const __m128i vi12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 12));
       const __m128i vk12xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 300 * sizeof(int8_t))));
-      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 16)));
+      const __m128i vi12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 16));
       const __m128i vk12xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(int8_t))));
-      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 20)));
+      const __m128i vi12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 20));
       const __m128i vk12xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 308 * sizeof(int8_t))));
       i12 += 24;
 
@@ -449,15 +449,15 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 316 * sizeof(int8_t))));
-      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 8)));
+      const __m128i vi13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 8));
       const __m128i vk13x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(int8_t))));
-      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 12)));
+      const __m128i vi13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 12));
       const __m128i vk13xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 324 * sizeof(int8_t))));
-      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 16)));
+      const __m128i vi13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 16));
       const __m128i vk13xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(int8_t))));
-      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 20)));
+      const __m128i vi13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 20));
       const __m128i vk13xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 332 * sizeof(int8_t))));
       i13 += 24;
 
@@ -470,15 +470,15 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 340 * sizeof(int8_t))));
-      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 8)));
+      const __m128i vi14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 8));
       const __m128i vk14x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(int8_t))));
-      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 12)));
+      const __m128i vi14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 12));
       const __m128i vk14xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 348 * sizeof(int8_t))));
-      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 16)));
+      const __m128i vi14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 16));
       const __m128i vk14xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(int8_t))));
-      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 20)));
+      const __m128i vi14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 20));
       const __m128i vk14xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 356 * sizeof(int8_t))));
       i14 += 24;
 
@@ -491,15 +491,15 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 364 * sizeof(int8_t))));
-      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 8)));
+      const __m128i vi15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 8));
       const __m128i vk15x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(int8_t))));
-      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 12)));
+      const __m128i vi15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 12));
       const __m128i vk15xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 372 * sizeof(int8_t))));
-      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 16)));
+      const __m128i vi15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 16));
       const __m128i vk15xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(int8_t))));
-      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 20)));
+      const __m128i vi15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 20));
       const __m128i vk15xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 380 * sizeof(int8_t))));
       i15 += 24;
 
@@ -512,15 +512,15 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 388 * sizeof(int8_t))));
-      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 8)));
+      const __m128i vi16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 8));
       const __m128i vk16x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(int8_t))));
-      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 12)));
+      const __m128i vi16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 12));
       const __m128i vk16xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 396 * sizeof(int8_t))));
-      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 16)));
+      const __m128i vi16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 16));
       const __m128i vk16xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(int8_t))));
-      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 20)));
+      const __m128i vi16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 20));
       const __m128i vk16xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 404 * sizeof(int8_t))));
       i16 += 24;
 
@@ -533,15 +533,15 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 412 * sizeof(int8_t))));
-      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 8)));
+      const __m128i vi17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 8));
       const __m128i vk17x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(int8_t))));
-      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 12)));
+      const __m128i vi17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 12));
       const __m128i vk17xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 420 * sizeof(int8_t))));
-      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 16)));
+      const __m128i vi17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 16));
       const __m128i vk17xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(int8_t))));
-      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 20)));
+      const __m128i vi17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 20));
       const __m128i vk17xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 428 * sizeof(int8_t))));
       i17 += 24;
 
@@ -554,15 +554,15 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 436 * sizeof(int8_t))));
-      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 8)));
+      const __m128i vi18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 8));
       const __m128i vk18x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(int8_t))));
-      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 12)));
+      const __m128i vi18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 12));
       const __m128i vk18xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 444 * sizeof(int8_t))));
-      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 16)));
+      const __m128i vi18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 16));
       const __m128i vk18xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(int8_t))));
-      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 20)));
+      const __m128i vi18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 20));
       const __m128i vk18xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 452 * sizeof(int8_t))));
       i18 += 24;
 
@@ -575,15 +575,15 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 460 * sizeof(int8_t))));
-      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 8)));
+      const __m128i vi19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 8));
       const __m128i vk19x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(int8_t))));
-      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 12)));
+      const __m128i vi19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 12));
       const __m128i vk19xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 468 * sizeof(int8_t))));
-      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 16)));
+      const __m128i vi19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 16));
       const __m128i vk19xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(int8_t))));
-      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 20)));
+      const __m128i vi19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 20));
       const __m128i vk19xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 476 * sizeof(int8_t))));
       i19 += 24;
 
@@ -596,15 +596,15 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 484 * sizeof(int8_t))));
-      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 8)));
+      const __m128i vi20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 8));
       const __m128i vk20x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(int8_t))));
-      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 12)));
+      const __m128i vi20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 12));
       const __m128i vk20xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 492 * sizeof(int8_t))));
-      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 16)));
+      const __m128i vi20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 16));
       const __m128i vk20xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(int8_t))));
-      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 20)));
+      const __m128i vi20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 20));
       const __m128i vk20xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 500 * sizeof(int8_t))));
       i20 += 24;
 
@@ -617,15 +617,15 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 508 * sizeof(int8_t))));
-      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 8)));
+      const __m128i vi21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 8));
       const __m128i vk21x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(int8_t))));
-      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 12)));
+      const __m128i vi21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 12));
       const __m128i vk21xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 516 * sizeof(int8_t))));
-      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 16)));
+      const __m128i vi21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 16));
       const __m128i vk21xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(int8_t))));
-      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 20)));
+      const __m128i vi21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 20));
       const __m128i vk21xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 524 * sizeof(int8_t))));
       i21 += 24;
 
@@ -638,15 +638,15 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 532 * sizeof(int8_t))));
-      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 8)));
+      const __m128i vi22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 8));
       const __m128i vk22x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(int8_t))));
-      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 12)));
+      const __m128i vi22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 12));
       const __m128i vk22xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 540 * sizeof(int8_t))));
-      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 16)));
+      const __m128i vi22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 16));
       const __m128i vk22xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(int8_t))));
-      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 20)));
+      const __m128i vi22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 20));
       const __m128i vk22xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 548 * sizeof(int8_t))));
       i22 += 24;
 
@@ -659,15 +659,15 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 556 * sizeof(int8_t))));
-      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 8)));
+      const __m128i vi23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 8));
       const __m128i vk23x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(int8_t))));
-      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 12)));
+      const __m128i vi23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 12));
       const __m128i vk23xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 564 * sizeof(int8_t))));
-      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 16)));
+      const __m128i vi23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 16));
       const __m128i vk23xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(int8_t))));
-      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 20)));
+      const __m128i vi23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 20));
       const __m128i vk23xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 572 * sizeof(int8_t))));
       i23 += 24;
 
@@ -680,15 +680,15 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 580 * sizeof(int8_t))));
-      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 8)));
+      const __m128i vi24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 8));
       const __m128i vk24x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(int8_t))));
-      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 12)));
+      const __m128i vi24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 12));
       const __m128i vk24xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 588 * sizeof(int8_t))));
-      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 16)));
+      const __m128i vi24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 16));
       const __m128i vk24xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(int8_t))));
-      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 20)));
+      const __m128i vi24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 20));
       const __m128i vk24xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 596 * sizeof(int8_t))));
       i24 += 24;
 
@@ -795,151 +795,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
 
         vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 216)));
         i9 += 4;
 
         vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 240)));
         i10 += 4;
 
         vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 264)));
         i11 += 4;
 
         vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 288)));
         i12 += 4;
 
         vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 312)));
         i13 += 4;
 
         vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 336)));
         i14 += 4;
 
         vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 360)));
         i15 += 4;
 
         vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 384)));
         i16 += 4;
 
         vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 408)));
         i17 += 4;
 
         vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 432)));
         i18 += 4;
 
         vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 456)));
         i19 += 4;
 
         vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 480)));
         i20 += 4;
 
         vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 504)));
         i21 += 4;
 
         vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 528)));
         i22 += 4;
 
         vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 552)));
         i23 += 4;
 
         vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 576)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
index 55bdc07..caff13c 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
@@ -91,15 +91,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -112,15 +112,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -133,15 +133,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -154,15 +154,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -175,15 +175,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -196,15 +196,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -217,15 +217,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -238,15 +238,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -259,15 +259,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -325,55 +325,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c
index 685466d..cae7a19 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c
@@ -91,15 +91,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -112,15 +112,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -133,15 +133,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -154,15 +154,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -175,15 +175,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -196,15 +196,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -217,15 +217,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -238,15 +238,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -259,15 +259,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -325,55 +325,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c
index 7da9e42..4f7df13 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c
@@ -96,15 +96,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -117,15 +117,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -138,15 +138,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -159,15 +159,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -180,15 +180,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -201,15 +201,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -222,15 +222,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -243,15 +243,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -264,15 +264,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -330,55 +330,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul32.c
index 709d965..ad86139 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul32.c
@@ -91,15 +91,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -112,15 +112,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -133,15 +133,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -154,15 +154,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -175,15 +175,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -196,15 +196,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -217,15 +217,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -238,15 +238,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -259,15 +259,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -374,55 +374,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul32.c
index 68dbff7..1b171cd 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul32.c
@@ -91,15 +91,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -112,15 +112,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -133,15 +133,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -154,15 +154,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -175,15 +175,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -196,15 +196,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -217,15 +217,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -238,15 +238,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -259,15 +259,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -374,55 +374,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-xop-mul32.c
index eea6e4d..916a838 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-xop-mul32.c
@@ -96,15 +96,15 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 4 * sizeof(int8_t))));
-      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 8)));
+      const __m128i vi0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 8));
       const __m128i vk0x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 12)));
+      const __m128i vi0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 12));
       const __m128i vk0xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 12 * sizeof(int8_t))));
-      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 16)));
+      const __m128i vi0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 16));
       const __m128i vk0xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 20)));
+      const __m128i vi0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 20));
       const __m128i vk0xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i0 += 24;
 
@@ -117,15 +117,15 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 28 * sizeof(int8_t))));
-      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 8)));
+      const __m128i vi1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 8));
       const __m128i vk1x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 12)));
+      const __m128i vi1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 12));
       const __m128i vk1xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 36 * sizeof(int8_t))));
-      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 16)));
+      const __m128i vi1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 16));
       const __m128i vk1xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 20)));
+      const __m128i vi1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 20));
       const __m128i vk1xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i1 += 24;
 
@@ -138,15 +138,15 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 52 * sizeof(int8_t))));
-      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 8)));
+      const __m128i vi2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 8));
       const __m128i vk2x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 12)));
+      const __m128i vi2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 12));
       const __m128i vk2xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 60 * sizeof(int8_t))));
-      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 16)));
+      const __m128i vi2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 16));
       const __m128i vk2xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 20)));
+      const __m128i vi2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 20));
       const __m128i vk2xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i2 += 24;
 
@@ -159,15 +159,15 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 76 * sizeof(int8_t))));
-      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 8)));
+      const __m128i vi3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 8));
       const __m128i vk3x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 12)));
+      const __m128i vi3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 12));
       const __m128i vk3xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 84 * sizeof(int8_t))));
-      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 16)));
+      const __m128i vi3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 16));
       const __m128i vk3xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 20)));
+      const __m128i vi3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 20));
       const __m128i vk3xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i3 += 24;
 
@@ -180,15 +180,15 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 100 * sizeof(int8_t))));
-      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 8)));
+      const __m128i vi4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 8));
       const __m128i vk4x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 12)));
+      const __m128i vi4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 12));
       const __m128i vk4xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 108 * sizeof(int8_t))));
-      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 16)));
+      const __m128i vi4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 16));
       const __m128i vk4xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 20)));
+      const __m128i vi4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 20));
       const __m128i vk4xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i4 += 24;
 
@@ -201,15 +201,15 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 124 * sizeof(int8_t))));
-      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 8)));
+      const __m128i vi5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 8));
       const __m128i vk5x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 12)));
+      const __m128i vi5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 12));
       const __m128i vk5xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 132 * sizeof(int8_t))));
-      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 16)));
+      const __m128i vi5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 16));
       const __m128i vk5xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 20)));
+      const __m128i vi5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 20));
       const __m128i vk5xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i5 += 24;
 
@@ -222,15 +222,15 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 148 * sizeof(int8_t))));
-      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 8)));
+      const __m128i vi6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 8));
       const __m128i vk6x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 12)));
+      const __m128i vi6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 12));
       const __m128i vk6xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 156 * sizeof(int8_t))));
-      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 16)));
+      const __m128i vi6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 16));
       const __m128i vk6xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 20)));
+      const __m128i vi6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 20));
       const __m128i vk6xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i6 += 24;
 
@@ -243,15 +243,15 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 172 * sizeof(int8_t))));
-      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 8)));
+      const __m128i vi7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 8));
       const __m128i vk7x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 12)));
+      const __m128i vi7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 12));
       const __m128i vk7xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 180 * sizeof(int8_t))));
-      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 16)));
+      const __m128i vi7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 16));
       const __m128i vk7xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 20)));
+      const __m128i vi7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 20));
       const __m128i vk7xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i7 += 24;
 
@@ -264,15 +264,15 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 196 * sizeof(int8_t))));
-      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 8)));
+      const __m128i vi8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 8));
       const __m128i vk8x89AB = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(int8_t))));
-      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 12)));
+      const __m128i vi8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 12));
       const __m128i vk8xCDEF = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 204 * sizeof(int8_t))));
-      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 16)));
+      const __m128i vi8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 16));
       const __m128i vk8xGHIJ = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(int8_t))));
-      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 20)));
+      const __m128i vi8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 20));
       const __m128i vk8xKLMN = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 212 * sizeof(int8_t))));
       i8 += 24;
 
@@ -379,55 +379,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
index 793b4d7..b7f8504 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
@@ -167,7 +167,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -176,7 +176,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -185,7 +185,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -194,7 +194,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -203,7 +203,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -212,7 +212,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -221,7 +221,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -230,7 +230,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -239,7 +239,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -248,7 +248,7 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i9 += 8;
 
@@ -257,7 +257,7 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(int8_t))));
       i10 += 8;
 
@@ -266,7 +266,7 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i11 += 8;
 
@@ -275,7 +275,7 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(int8_t))));
       i12 += 8;
 
@@ -284,7 +284,7 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i13 += 8;
 
@@ -293,7 +293,7 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i14 += 8;
 
@@ -302,7 +302,7 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i15 += 8;
 
@@ -311,7 +311,7 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(int8_t))));
       i16 += 8;
 
@@ -320,7 +320,7 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i17 += 8;
 
@@ -329,7 +329,7 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(int8_t))));
       i18 += 8;
 
@@ -338,7 +338,7 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i19 += 8;
 
@@ -347,7 +347,7 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i20 += 8;
 
@@ -356,7 +356,7 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i21 += 8;
 
@@ -365,7 +365,7 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(int8_t))));
       i22 += 8;
 
@@ -374,7 +374,7 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i23 += 8;
 
@@ -383,7 +383,7 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(int8_t))));
       i24 += 8;
 
@@ -419,151 +419,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 88)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 104)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 136)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 152)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 184)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
index 0e6e79f..e52824f 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
@@ -167,7 +167,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -176,7 +176,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -185,7 +185,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -194,7 +194,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -203,7 +203,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -212,7 +212,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -221,7 +221,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -230,7 +230,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -239,7 +239,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -248,7 +248,7 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i9 += 8;
 
@@ -257,7 +257,7 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(int8_t))));
       i10 += 8;
 
@@ -266,7 +266,7 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i11 += 8;
 
@@ -275,7 +275,7 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(int8_t))));
       i12 += 8;
 
@@ -284,7 +284,7 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i13 += 8;
 
@@ -293,7 +293,7 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i14 += 8;
 
@@ -302,7 +302,7 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i15 += 8;
 
@@ -311,7 +311,7 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(int8_t))));
       i16 += 8;
 
@@ -320,7 +320,7 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i17 += 8;
 
@@ -329,7 +329,7 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(int8_t))));
       i18 += 8;
 
@@ -338,7 +338,7 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i19 += 8;
 
@@ -347,7 +347,7 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i20 += 8;
 
@@ -356,7 +356,7 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i21 += 8;
 
@@ -365,7 +365,7 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(int8_t))));
       i22 += 8;
 
@@ -374,7 +374,7 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i23 += 8;
 
@@ -383,7 +383,7 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(int8_t))));
       i24 += 8;
 
@@ -419,151 +419,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 88)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 104)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 136)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 152)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 184)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
index a94e03d..d5174da 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
@@ -172,7 +172,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -181,7 +181,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -190,7 +190,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -199,7 +199,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -208,7 +208,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -217,7 +217,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -226,7 +226,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -235,7 +235,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -244,7 +244,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -253,7 +253,7 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i9 += 8;
 
@@ -262,7 +262,7 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(int8_t))));
       i10 += 8;
 
@@ -271,7 +271,7 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i11 += 8;
 
@@ -280,7 +280,7 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(int8_t))));
       i12 += 8;
 
@@ -289,7 +289,7 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i13 += 8;
 
@@ -298,7 +298,7 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i14 += 8;
 
@@ -307,7 +307,7 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i15 += 8;
 
@@ -316,7 +316,7 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(int8_t))));
       i16 += 8;
 
@@ -325,7 +325,7 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i17 += 8;
 
@@ -334,7 +334,7 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(int8_t))));
       i18 += 8;
 
@@ -343,7 +343,7 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i19 += 8;
 
@@ -352,7 +352,7 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i20 += 8;
 
@@ -361,7 +361,7 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i21 += 8;
 
@@ -370,7 +370,7 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(int8_t))));
       i22 += 8;
 
@@ -379,7 +379,7 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i23 += 8;
 
@@ -388,7 +388,7 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(int8_t))));
       i24 += 8;
 
@@ -424,151 +424,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
 
         vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i9 += 4;
 
         vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i10 += 4;
 
         vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 88)));
         i11 += 4;
 
         vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i12 += 4;
 
         vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 104)));
         i13 += 4;
 
         vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i14 += 4;
 
         vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i15 += 4;
 
         vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i16 += 4;
 
         vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 136)));
         i17 += 4;
 
         vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i18 += 4;
 
         vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 152)));
         i19 += 4;
 
         vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i20 += 4;
 
         vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i21 += 4;
 
         vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i22 += 4;
 
         vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 184)));
         i23 += 4;
 
         vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c
index e831243..0a996ba 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c
@@ -167,7 +167,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -176,7 +176,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -185,7 +185,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -194,7 +194,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -203,7 +203,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -212,7 +212,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -221,7 +221,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -230,7 +230,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -239,7 +239,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -248,7 +248,7 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i9 += 8;
 
@@ -257,7 +257,7 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(int8_t))));
       i10 += 8;
 
@@ -266,7 +266,7 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i11 += 8;
 
@@ -275,7 +275,7 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(int8_t))));
       i12 += 8;
 
@@ -284,7 +284,7 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i13 += 8;
 
@@ -293,7 +293,7 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i14 += 8;
 
@@ -302,7 +302,7 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i15 += 8;
 
@@ -311,7 +311,7 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(int8_t))));
       i16 += 8;
 
@@ -320,7 +320,7 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i17 += 8;
 
@@ -329,7 +329,7 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(int8_t))));
       i18 += 8;
 
@@ -338,7 +338,7 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i19 += 8;
 
@@ -347,7 +347,7 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i20 += 8;
 
@@ -356,7 +356,7 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i21 += 8;
 
@@ -365,7 +365,7 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(int8_t))));
       i22 += 8;
 
@@ -374,7 +374,7 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i23 += 8;
 
@@ -383,7 +383,7 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(int8_t))));
       i24 += 8;
 
@@ -440,151 +440,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 88)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 104)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 136)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 152)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 184)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c
index fa2fe0c..ffd093e 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c
@@ -167,7 +167,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -176,7 +176,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -185,7 +185,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -194,7 +194,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -203,7 +203,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -212,7 +212,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -221,7 +221,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -230,7 +230,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -239,7 +239,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -248,7 +248,7 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i9 += 8;
 
@@ -257,7 +257,7 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(int8_t))));
       i10 += 8;
 
@@ -266,7 +266,7 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i11 += 8;
 
@@ -275,7 +275,7 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(int8_t))));
       i12 += 8;
 
@@ -284,7 +284,7 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i13 += 8;
 
@@ -293,7 +293,7 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i14 += 8;
 
@@ -302,7 +302,7 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i15 += 8;
 
@@ -311,7 +311,7 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(int8_t))));
       i16 += 8;
 
@@ -320,7 +320,7 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i17 += 8;
 
@@ -329,7 +329,7 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(int8_t))));
       i18 += 8;
 
@@ -338,7 +338,7 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i19 += 8;
 
@@ -347,7 +347,7 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i20 += 8;
 
@@ -356,7 +356,7 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i21 += 8;
 
@@ -365,7 +365,7 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(int8_t))));
       i22 += 8;
 
@@ -374,7 +374,7 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i23 += 8;
 
@@ -383,7 +383,7 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(int8_t))));
       i24 += 8;
 
@@ -440,151 +440,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i9 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i10 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 88)));
         i11 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i12 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 104)));
         i13 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i14 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i15 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i16 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 136)));
         i17 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i18 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 152)));
         i19 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i20 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i21 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i22 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 184)));
         i23 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c
index e1b3c08..ab7522a 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c
@@ -172,7 +172,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -181,7 +181,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -190,7 +190,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -199,7 +199,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -208,7 +208,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -217,7 +217,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -226,7 +226,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -235,7 +235,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -244,7 +244,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -253,7 +253,7 @@
 
       const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
       const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t))));
-      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i9 + 4)));
+      const __m128i vi9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9 + 4));
       const __m128i vk9x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(int8_t))));
       i9 += 8;
 
@@ -262,7 +262,7 @@
 
       const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
       const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t))));
-      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i10 + 4)));
+      const __m128i vi10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10 + 4));
       const __m128i vk10x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(int8_t))));
       i10 += 8;
 
@@ -271,7 +271,7 @@
 
       const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
       const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t))));
-      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i11 + 4)));
+      const __m128i vi11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11 + 4));
       const __m128i vk11x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(int8_t))));
       i11 += 8;
 
@@ -280,7 +280,7 @@
 
       const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
       const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t))));
-      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i12 + 4)));
+      const __m128i vi12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12 + 4));
       const __m128i vk12x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(int8_t))));
       i12 += 8;
 
@@ -289,7 +289,7 @@
 
       const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
       const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t))));
-      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i13 + 4)));
+      const __m128i vi13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13 + 4));
       const __m128i vk13x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(int8_t))));
       i13 += 8;
 
@@ -298,7 +298,7 @@
 
       const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
       const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t))));
-      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i14 + 4)));
+      const __m128i vi14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14 + 4));
       const __m128i vk14x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(int8_t))));
       i14 += 8;
 
@@ -307,7 +307,7 @@
 
       const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
       const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t))));
-      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i15 + 4)));
+      const __m128i vi15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15 + 4));
       const __m128i vk15x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(int8_t))));
       i15 += 8;
 
@@ -316,7 +316,7 @@
 
       const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
       const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t))));
-      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i16 + 4)));
+      const __m128i vi16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16 + 4));
       const __m128i vk16x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(int8_t))));
       i16 += 8;
 
@@ -325,7 +325,7 @@
 
       const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
       const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t))));
-      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i17 + 4)));
+      const __m128i vi17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17 + 4));
       const __m128i vk17x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(int8_t))));
       i17 += 8;
 
@@ -334,7 +334,7 @@
 
       const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
       const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))));
-      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i18 + 4)));
+      const __m128i vi18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18 + 4));
       const __m128i vk18x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(int8_t))));
       i18 += 8;
 
@@ -343,7 +343,7 @@
 
       const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
       const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t))));
-      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i19 + 4)));
+      const __m128i vi19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19 + 4));
       const __m128i vk19x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(int8_t))));
       i19 += 8;
 
@@ -352,7 +352,7 @@
 
       const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
       const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t))));
-      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i20 + 4)));
+      const __m128i vi20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20 + 4));
       const __m128i vk20x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(int8_t))));
       i20 += 8;
 
@@ -361,7 +361,7 @@
 
       const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
       const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t))));
-      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i21 + 4)));
+      const __m128i vi21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21 + 4));
       const __m128i vk21x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(int8_t))));
       i21 += 8;
 
@@ -370,7 +370,7 @@
 
       const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
       const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t))));
-      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i22 + 4)));
+      const __m128i vi22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22 + 4));
       const __m128i vk22x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(int8_t))));
       i22 += 8;
 
@@ -379,7 +379,7 @@
 
       const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
       const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t))));
-      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i23 + 4)));
+      const __m128i vi23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23 + 4));
       const __m128i vk23x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(int8_t))));
       i23 += 8;
 
@@ -388,7 +388,7 @@
 
       const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
       const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t))));
-      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i24 + 4)));
+      const __m128i vi24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24 + 4));
       const __m128i vk24x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(int8_t))));
       i24 += 8;
 
@@ -445,151 +445,126 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
 
         vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
-
         const __m128i vi9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i9));
         const __m128i vk9x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 72)));
         i9 += 4;
 
         vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
-
         const __m128i vi10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i10));
         const __m128i vk10x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 80)));
         i10 += 4;
 
         vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
-
         const __m128i vi11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i11));
         const __m128i vk11x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 88)));
         i11 += 4;
 
         vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
-
         const __m128i vi12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i12));
         const __m128i vk12x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 96)));
         i12 += 4;
 
         vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
-
         const __m128i vi13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i13));
         const __m128i vk13x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 104)));
         i13 += 4;
 
         vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
-
         const __m128i vi14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i14));
         const __m128i vk14x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 112)));
         i14 += 4;
 
         vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
-
         const __m128i vi15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i15));
         const __m128i vk15x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 120)));
         i15 += 4;
 
         vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
-
         const __m128i vi16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i16));
         const __m128i vk16x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 128)));
         i16 += 4;
 
         vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
-
         const __m128i vi17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i17));
         const __m128i vk17x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 136)));
         i17 += 4;
 
         vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
-
         const __m128i vi18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i18));
         const __m128i vk18x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 144)));
         i18 += 4;
 
         vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
-
         const __m128i vi19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i19));
         const __m128i vk19x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 152)));
         i19 += 4;
 
         vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
-
         const __m128i vi20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i20));
         const __m128i vk20x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 160)));
         i20 += 4;
 
         vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
-
         const __m128i vi21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i21));
         const __m128i vk21x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 168)));
         i21 += 4;
 
         vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
-
         const __m128i vi22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i22));
         const __m128i vk22x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 176)));
         i22 += 4;
 
         vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
-
         const __m128i vi23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i23));
         const __m128i vk23x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 184)));
         i23 += 4;
 
         vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
-
         const __m128i vi24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i24));
         const __m128i vk24x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 192)));
         i24 += 4;
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
index 1f410d2..42feccf 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
@@ -87,7 +87,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -96,7 +96,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -105,7 +105,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -114,7 +114,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -123,7 +123,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -132,7 +132,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -141,7 +141,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -150,7 +150,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -159,7 +159,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -195,55 +195,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
index 7d38698..968e385 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
@@ -87,7 +87,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -96,7 +96,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -105,7 +105,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -114,7 +114,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -123,7 +123,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -132,7 +132,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -141,7 +141,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -150,7 +150,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -159,7 +159,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -195,55 +195,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
index 75e872a..0df9bf2 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
@@ -92,7 +92,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -101,7 +101,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -110,7 +110,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -119,7 +119,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -128,7 +128,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -137,7 +137,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -146,7 +146,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -155,7 +155,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -164,7 +164,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -200,55 +200,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c
index 64b51ef..fb42537 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c
@@ -87,7 +87,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -96,7 +96,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -105,7 +105,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -114,7 +114,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -123,7 +123,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -132,7 +132,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -141,7 +141,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -150,7 +150,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -159,7 +159,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -216,55 +216,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul32.c
index d9c43ad..5f2d77d 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul32.c
@@ -87,7 +87,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -96,7 +96,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -105,7 +105,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -114,7 +114,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -123,7 +123,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -132,7 +132,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -141,7 +141,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -150,7 +150,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -159,7 +159,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -216,55 +216,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c
index a4ea58a..cf1d6ad 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c
@@ -92,7 +92,7 @@
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
       const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t))));
-      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i0 + 4)));
+      const __m128i vi0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0 + 4));
       const __m128i vk0x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(int8_t))));
       i0 += 8;
 
@@ -101,7 +101,7 @@
 
       const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
       const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t))));
-      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i1 + 4)));
+      const __m128i vi1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1 + 4));
       const __m128i vk1x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(int8_t))));
       i1 += 8;
 
@@ -110,7 +110,7 @@
 
       const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
       const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t))));
-      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i2 + 4)));
+      const __m128i vi2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2 + 4));
       const __m128i vk2x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(int8_t))));
       i2 += 8;
 
@@ -119,7 +119,7 @@
 
       const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
       const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t))));
-      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i3 + 4)));
+      const __m128i vi3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3 + 4));
       const __m128i vk3x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(int8_t))));
       i3 += 8;
 
@@ -128,7 +128,7 @@
 
       const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
       const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t))));
-      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i4 + 4)));
+      const __m128i vi4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4 + 4));
       const __m128i vk4x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(int8_t))));
       i4 += 8;
 
@@ -137,7 +137,7 @@
 
       const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
       const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t))));
-      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i5 + 4)));
+      const __m128i vi5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5 + 4));
       const __m128i vk5x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(int8_t))));
       i5 += 8;
 
@@ -146,7 +146,7 @@
 
       const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
       const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t))));
-      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i6 + 4)));
+      const __m128i vi6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6 + 4));
       const __m128i vk6x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(int8_t))));
       i6 += 8;
 
@@ -155,7 +155,7 @@
 
       const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
       const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t))));
-      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i7 + 4)));
+      const __m128i vi7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7 + 4));
       const __m128i vk7x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(int8_t))));
       i7 += 8;
 
@@ -164,7 +164,7 @@
 
       const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
       const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t))));
-      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i8 + 4)));
+      const __m128i vi8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8 + 4));
       const __m128i vk8x4567 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(int8_t))));
       i8 += 8;
 
@@ -221,55 +221,46 @@
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
-
         const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
         const __m128i vk0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
         i0 += 4;
 
         vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
-
         const __m128i vi1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i1));
         const __m128i vk1x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 8)));
         i1 += 4;
 
         vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
-
         const __m128i vi2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i2));
         const __m128i vk2x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 16)));
         i2 += 4;
 
         vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
-
         const __m128i vi3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i3));
         const __m128i vk3x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 24)));
         i3 += 4;
 
         vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
-
         const __m128i vi4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i4));
         const __m128i vk4x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 32)));
         i4 += 4;
 
         vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
-
         const __m128i vi5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i5));
         const __m128i vk5x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 40)));
         i5 += 4;
 
         vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
-
         const __m128i vi6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i6));
         const __m128i vk6x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 48)));
         i6 += 4;
 
         vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
-
         const __m128i vi7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i7));
         const __m128i vk7x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 56)));
         i7 += 4;
 
         vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
-
         const __m128i vi8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i8));
         const __m128i vk8x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + 64)));
         i8 += 4;
diff --git a/src/qs8-dwconv/unipass-sse-mul32.c.in b/src/qs8-dwconv/unipass-sse-mul32.c.in
index 9ff896c..e05ac5e 100644
--- a/src/qs8-dwconv/unipass-sse-mul32.c.in
+++ b/src/qs8-dwconv/unipass-sse-mul32.c.in
@@ -7,7 +7,7 @@
 $assert not XOP or AVX
 $assert not AVX or SSE == 4
 $assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
-$assert DATATYPE in ["QC8", "QS8"]
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
 $assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
 $assert SSE != 3 or REQUANTIZATION != "FP32"
 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
@@ -30,32 +30,35 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_qs8_conv_minmax_params"
-$PARAMS_STRUCT = "sse4" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_sse4"
+$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_%s_conv_minmax_params" % DATATYPE.lower()
+$PARAMS_STRUCT = {"QC8": "sse4", "QS8": REQUANTIZATION.lower() + "_sse4", "QU8": REQUANTIZATION.lower() + "_sse2"}[DATATYPE]
 $ISA = "xop" if XOP else "avx" if AVX else {4: "sse41"}[SSE]
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
 void xnn_${DATATYPE.lower()}_dwconv_minmax_${REQUANTIZATION.lower()}_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${ISA}_mul32(
     size_t channels,
     size_t output_width,
-    const int8_t** input,
+    const ${XINT8_T}** input,
     const void* weights,
-    int8_t* output,
+    ${XINT8_T}* output,
     size_t input_stride,
     size_t output_increment,
     size_t input_offset,
-    const int8_t* zero,
+    const ${XINT8_T}* zero,
     const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
 {
   assert(channels != 0);
   assert(output_width != 0);
 
+  $if DATATYPE == "QU8":
+    const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->${PARAMS_STRUCT}.kernel_zero_point));
   do {
     $for K in range(KERNEL_TILE):
-      const int8_t* i${K} = input[${K}];
+      const ${XINT8_T}* i${K} = input[${K}];
       assert(i${K} != NULL);
       if XNN_UNPREDICTABLE(i${K} != zero) {
-        i${K} = (const int8_t*) ((uintptr_t) i${K} + input_offset);
+        i${K} = (const ${XINT8_T}*) ((uintptr_t) i${K} + input_offset);
       }
-    input = (const int8_t**) ((uintptr_t) input + input_stride);
+    input = (const ${XINT8_T}**) ((uintptr_t) input + input_stride);
 
     size_t c = channels;
     const void* w = weights;
@@ -67,11 +70,18 @@
       $for K in range(KERNEL_TILE):
 
         $for C in range(0, CHANNEL_TILE, 4):
-          $if C == 0:
-            const __m128i vi${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(i${K}));
+          $if DATATYPE == "QU8":
+            $if C == 0:
+              const __m128i vi${K}x${ABC[0:4]} = _mm_cvtepu8_epi32(_mm_loadu_si32(i${K}));
+            $else:
+              const __m128i vi${K}x${ABC[C:C+4]} = _mm_cvtepu8_epi32(_mm_loadu_si32(i${K} + ${C}));
+            const __m128i vk${K}x${ABC[C:C+4]} = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE + C} * sizeof(${XINT8_T})))), vk_zero_point);
           $else:
-            const __m128i vi${K}x${ABC[C:C+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (i${K} + ${C})));
-          const __m128i vk${K}x${ABC[C:C+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE + C} * sizeof(int8_t))));
+            $if C == 0:
+              const __m128i vi${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(i${K}));
+            $else:
+              const __m128i vi${K}x${ABC[C:C+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(i${K} + ${C}));
+            const __m128i vk${K}x${ABC[C:C+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE + C} * sizeof(${XINT8_T}))));
         i${K} += ${CHANNEL_TILE};
 
         $for C in range(0, CHANNEL_TILE, 4):
@@ -80,7 +90,7 @@
           $else:
             vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_mullo_epi32(vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]}));
 
-      w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(int8_t));
+      w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(${XINT8_T}));
 
       $if REQUANTIZATION == "GEMMLOWP":
         const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.multiplier);
@@ -133,15 +143,26 @@
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
-      $for C in range(0, CHANNEL_TILE, 16):
-        $if C + 8 < CHANNEL_TILE:
-          __m128i vout${ABC[C:C+16]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
-          vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min);
-          vout${ABC[C:C+16]} = _mm_min_epi8(vout${ABC[C:C+16]}, voutput_max);
-        $else:
-          __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
-          vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
-          vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_min_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
+      $if DATATYPE == "QU8":
+        $for C in range(0, CHANNEL_TILE, 16):
+          $if C + 8 < CHANNEL_TILE:
+            __m128i vout${ABC[C:C+16]} = _mm_packus_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
+            vout${ABC[C:C+16]} = _mm_max_epu8(vout${ABC[C:C+16]}, voutput_min);
+            vout${ABC[C:C+16]} = _mm_min_epu8(vout${ABC[C:C+16]}, voutput_max);
+          $else:
+            __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packus_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+            vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epu8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
+            vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_min_epu8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
+      $else:
+        $for C in range(0, CHANNEL_TILE, 16):
+          $if C + 8 < CHANNEL_TILE:
+            __m128i vout${ABC[C:C+16]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
+            vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min);
+            vout${ABC[C:C+16]} = _mm_min_epi8(vout${ABC[C:C+16]}, voutput_max);
+          $else:
+            __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+            vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
+            vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_min_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
 
       $if CHANNEL_TILE > 8:
         _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
@@ -156,20 +177,29 @@
     }
     if XNN_UNLIKELY(c != 0) {
       $if CHANNEL_TILE > 4:
-        const int8_t* k = (const int8_t*) ((const int32_t*) w + ${CHANNEL_TILE});
+        const ${XINT8_T}* k = (const ${XINT8_T}*) ((const int32_t*) w + ${CHANNEL_TILE});
       ${"do " if CHANNEL_TILE > 4 else ""}{
         __m128i vacc${ABC[0:4]} = _mm_loadu_si128((const __m128i*) w);
 
         $for K in range(KERNEL_TILE):
-
-          const __m128i vi${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(i${K}));
-          $if CHANNEL_TILE > 4:
-            $if K == 0:
-              const __m128i vk${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
+          $if DATATYPE == "QU8":
+            const __m128i vi${K}x${ABC[0:4]} = _mm_cvtepu8_epi32(_mm_loadu_si32(i${K}));
+            $if CHANNEL_TILE > 4:
+              $if K == 0:
+                const __m128i vk${K}x${ABC[0:4]} = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+              $else:
+                const __m128i vk${K}x${ABC[0:4]} = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + ${K * CHANNEL_TILE}))), vk_zero_point);
             $else:
-              const __m128i vk${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + ${K * CHANNEL_TILE})));
+              const __m128i vk${K}x${ABC[0:4]} = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE} * sizeof(${XINT8_T})))), vk_zero_point);
           $else:
-            const __m128i vk${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE} * sizeof(int8_t))));
+            const __m128i vi${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(i${K}));
+            $if CHANNEL_TILE > 4:
+              $if K == 0:
+                const __m128i vk${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(k));
+              $else:
+                const __m128i vk${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) (k + ${K * CHANNEL_TILE})));
+            $else:
+              const __m128i vk${K}x${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${K * CHANNEL_TILE} * sizeof(${XINT8_T}))));
           $if CHANNEL_TILE > 4:
             i${K} += 4;
 
@@ -206,7 +236,7 @@
         $elif REQUANTIZATION == "FP32":
           __m128 vscaled${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]});
           $if DATATYPE == "QC8":
-            const __m128 vscale${ABC[0:4]} = _mm_loadu_ps((const float*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${CHANNEL_TILE * KERNEL_TILE} * sizeof(int8_t)));
+            const __m128 vscale${ABC[0:4]} = _mm_loadu_ps((const float*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${CHANNEL_TILE * KERNEL_TILE} * sizeof(${XINT8_T})));
             vscaled${ABC[0:4]} = _mm_mul_ps(vscaled${ABC[0:4]}, vscale${ABC[0:4]});
           $else:
             vscaled${ABC[0:4]} = _mm_mul_ps(vscaled${ABC[0:4]}, _mm_load_ps(params->${PARAMS_STRUCT}.scale));
@@ -218,9 +248,14 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
         __m128i vout${ABC[0:4]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[0:4]}), voutput_zero_point);
 
-        vout${ABC[0:4]} = _mm_packs_epi16(vout${ABC[0:4]}, vout${ABC[0:4]});
-        vout${ABC[0:4]} = _mm_max_epi8(vout${ABC[0:4]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
-        vout${ABC[0:4]} = _mm_min_epi8(vout${ABC[0:4]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
+        $if DATATYPE == "QU8":
+          vout${ABC[0:4]} = _mm_packus_epi16(vout${ABC[0:4]}, vout${ABC[0:4]});
+          vout${ABC[0:4]} = _mm_max_epu8(vout${ABC[0:4]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+          vout${ABC[0:4]} = _mm_min_epu8(vout${ABC[0:4]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
+        $else:
+          vout${ABC[0:4]} = _mm_packs_epi16(vout${ABC[0:4]}, vout${ABC[0:4]});
+          vout${ABC[0:4]} = _mm_max_epi8(vout${ABC[0:4]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+          vout${ABC[0:4]} = _mm_min_epi8(vout${ABC[0:4]}, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
 
         $if CHANNEL_TILE > 4:
           if XNN_LIKELY(c >= 4) {
@@ -234,7 +269,7 @@
               output += 2;
             }
             if (c & 1) {
-              *output = (int8_t) _mm_extract_epi8(vout${ABC[0:4]}, 0);
+              *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:4]}, 0);
               output += 1;
             }
             c = 0;
@@ -246,12 +281,12 @@
             output += 2;
           }
           if (c & 1) {
-            *output = (int8_t) _mm_extract_epi8(vout${ABC[0:4]}, 0);
+            *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:4]}, 0);
             output += 1;
           }
       }${" while (c != 0);" if CHANNEL_TILE > 4 else ""}
     }
 
-    output = (int8_t*) ((uintptr_t) output + output_increment);
+    output = (${XINT8_T}*) ((uintptr_t) output + output_increment);
   } while (--output_width != 0);
 }
diff --git a/src/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c b/src/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
new file mode 100644
index 0000000..e017cb1
--- /dev/null
+++ b/src/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
@@ -0,0 +1,744 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32(
+    size_t channels,
+    size_t output_width,
+    const uint8_t** input,
+    const void* weights,
+    uint8_t* output,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const uint8_t* zero,
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));
+  do {
+    const uint8_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    const uint8_t* i9 = input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != zero) {
+      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
+    }
+    const uint8_t* i10 = input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != zero) {
+      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
+    }
+    const uint8_t* i11 = input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != zero) {
+      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
+    }
+    const uint8_t* i12 = input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != zero) {
+      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
+    }
+    const uint8_t* i13 = input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != zero) {
+      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
+    }
+    const uint8_t* i14 = input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != zero) {
+      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
+    }
+    const uint8_t* i15 = input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != zero) {
+      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
+    }
+    const uint8_t* i16 = input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != zero) {
+      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
+    }
+    const uint8_t* i17 = input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != zero) {
+      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
+    }
+    const uint8_t* i18 = input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != zero) {
+      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
+    }
+    const uint8_t* i19 = input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != zero) {
+      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
+    }
+    const uint8_t* i20 = input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != zero) {
+      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
+    }
+    const uint8_t* i21 = input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != zero) {
+      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
+    }
+    const uint8_t* i22 = input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != zero) {
+      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
+    }
+    const uint8_t* i23 = input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != zero) {
+      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
+    }
+    const uint8_t* i24 = input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != zero) {
+      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const void* w = weights;
+    for (; c >= 16; c -= 16) {
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
+      __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
+
+
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 8));
+      const __m128i vk0x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 12));
+      const __m128i vk0xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi0x4567, vk0x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi0x89AB, vk0x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi0xCDEF, vk0xCDEF));
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 8));
+      const __m128i vk1x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 12));
+      const __m128i vk1xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi1x4567, vk1x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi1x89AB, vk1x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi1xCDEF, vk1xCDEF));
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 8));
+      const __m128i vk2x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 12));
+      const __m128i vk2xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi2x4567, vk2x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi2x89AB, vk2x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi2xCDEF, vk2xCDEF));
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 8));
+      const __m128i vk3x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 12));
+      const __m128i vk3xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi3x4567, vk3x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi3x89AB, vk3x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi3xCDEF, vk3xCDEF));
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 8));
+      const __m128i vk4x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 12));
+      const __m128i vk4xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi4x4567, vk4x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi4x89AB, vk4x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi4xCDEF, vk4xCDEF));
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 8));
+      const __m128i vk5x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 12));
+      const __m128i vk5xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi5x4567, vk5x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi5x89AB, vk5x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi5xCDEF, vk5xCDEF));
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 8));
+      const __m128i vk6x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 12));
+      const __m128i vk6xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi6x4567, vk6x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi6x89AB, vk6x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi6xCDEF, vk6xCDEF));
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 8));
+      const __m128i vk7x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 12));
+      const __m128i vk7xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi7x4567, vk7x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi7x89AB, vk7x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi7xCDEF, vk7xCDEF));
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 8));
+      const __m128i vk8x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 12));
+      const __m128i vk8xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi8x4567, vk8x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi8x89AB, vk8x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi8xCDEF, vk8xCDEF));
+
+      const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+      const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 4));
+      const __m128i vk9x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 8));
+      const __m128i vk9x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 12));
+      const __m128i vk9xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(uint8_t)))), vk_zero_point);
+      i9 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi9x4567, vk9x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi9x89AB, vk9x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi9xCDEF, vk9xCDEF));
+
+      const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+      const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 4));
+      const __m128i vk10x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 8));
+      const __m128i vk10x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 12));
+      const __m128i vk10xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(uint8_t)))), vk_zero_point);
+      i10 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi10x4567, vk10x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi10x89AB, vk10x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi10xCDEF, vk10xCDEF));
+
+      const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+      const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 4));
+      const __m128i vk11x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 8));
+      const __m128i vk11x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 12));
+      const __m128i vk11xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(uint8_t)))), vk_zero_point);
+      i11 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi11x4567, vk11x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi11x89AB, vk11x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi11xCDEF, vk11xCDEF));
+
+      const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+      const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 4));
+      const __m128i vk12x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 8));
+      const __m128i vk12x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 12));
+      const __m128i vk12xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(uint8_t)))), vk_zero_point);
+      i12 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi12x4567, vk12x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi12x89AB, vk12x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi12xCDEF, vk12xCDEF));
+
+      const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+      const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 4));
+      const __m128i vk13x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 8));
+      const __m128i vk13x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 12));
+      const __m128i vk13xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(uint8_t)))), vk_zero_point);
+      i13 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi13x4567, vk13x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi13x89AB, vk13x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi13xCDEF, vk13xCDEF));
+
+      const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+      const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 4));
+      const __m128i vk14x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 8));
+      const __m128i vk14x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 12));
+      const __m128i vk14xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(uint8_t)))), vk_zero_point);
+      i14 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi14x4567, vk14x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi14x89AB, vk14x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi14xCDEF, vk14xCDEF));
+
+      const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+      const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 4));
+      const __m128i vk15x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 8));
+      const __m128i vk15x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 12));
+      const __m128i vk15xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(uint8_t)))), vk_zero_point);
+      i15 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi15x4567, vk15x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi15x89AB, vk15x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi15xCDEF, vk15xCDEF));
+
+      const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+      const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 4));
+      const __m128i vk16x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 8));
+      const __m128i vk16x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 12));
+      const __m128i vk16xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(uint8_t)))), vk_zero_point);
+      i16 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi16x4567, vk16x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi16x89AB, vk16x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi16xCDEF, vk16xCDEF));
+
+      const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+      const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 4));
+      const __m128i vk17x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 8));
+      const __m128i vk17x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 12));
+      const __m128i vk17xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(uint8_t)))), vk_zero_point);
+      i17 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi17x4567, vk17x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi17x89AB, vk17x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi17xCDEF, vk17xCDEF));
+
+      const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+      const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 4));
+      const __m128i vk18x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 8));
+      const __m128i vk18x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 12));
+      const __m128i vk18xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(uint8_t)))), vk_zero_point);
+      i18 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi18x4567, vk18x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi18x89AB, vk18x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi18xCDEF, vk18xCDEF));
+
+      const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+      const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 4));
+      const __m128i vk19x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 8));
+      const __m128i vk19x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 12));
+      const __m128i vk19xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(uint8_t)))), vk_zero_point);
+      i19 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi19x4567, vk19x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi19x89AB, vk19x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi19xCDEF, vk19xCDEF));
+
+      const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+      const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 4));
+      const __m128i vk20x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 8));
+      const __m128i vk20x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 12));
+      const __m128i vk20xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(uint8_t)))), vk_zero_point);
+      i20 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi20x4567, vk20x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi20x89AB, vk20x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi20xCDEF, vk20xCDEF));
+
+      const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+      const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 4));
+      const __m128i vk21x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 8));
+      const __m128i vk21x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 12));
+      const __m128i vk21xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(uint8_t)))), vk_zero_point);
+      i21 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi21x4567, vk21x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi21x89AB, vk21x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi21xCDEF, vk21xCDEF));
+
+      const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+      const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 4));
+      const __m128i vk22x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 8));
+      const __m128i vk22x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 12));
+      const __m128i vk22xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(uint8_t)))), vk_zero_point);
+      i22 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi22x4567, vk22x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi22x89AB, vk22x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi22xCDEF, vk22xCDEF));
+
+      const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+      const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 4));
+      const __m128i vk23x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 8));
+      const __m128i vk23x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 12));
+      const __m128i vk23xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(uint8_t)))), vk_zero_point);
+      i23 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi23x4567, vk23x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi23x89AB, vk23x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi23xCDEF, vk23xCDEF));
+
+      const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+      const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 4));
+      const __m128i vk24x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 8));
+      const __m128i vk24x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 12));
+      const __m128i vk24xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(uint8_t)))), vk_zero_point);
+      i24 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi24x0123, vk24x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi24x4567, vk24x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi24x89AB, vk24x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi24xCDEF, vk24xCDEF));
+
+      w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(uint8_t));
+
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+      __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
+      __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+      vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
+      vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+      vacc89AB = _mm_cvtps_epi32(vscaled89AB);
+      vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
+
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+      __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
+
+      _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+      output += 16;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
+      do {
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i1 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 80))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 96))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 112))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 128))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+        const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+        const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 144))), vk_zero_point);
+        i9 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
+        const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+        const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 160))), vk_zero_point);
+        i10 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
+        const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+        const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 176))), vk_zero_point);
+        i11 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
+        const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+        const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 192))), vk_zero_point);
+        i12 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
+        const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+        const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 208))), vk_zero_point);
+        i13 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
+        const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+        const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 224))), vk_zero_point);
+        i14 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
+        const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+        const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 240))), vk_zero_point);
+        i15 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
+        const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+        const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 256))), vk_zero_point);
+        i16 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
+        const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+        const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 272))), vk_zero_point);
+        i17 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
+        const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+        const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 288))), vk_zero_point);
+        i18 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
+        const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+        const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 304))), vk_zero_point);
+        i19 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
+        const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+        const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 320))), vk_zero_point);
+        i20 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
+        const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+        const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 336))), vk_zero_point);
+        i21 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
+        const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+        const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 352))), vk_zero_point);
+        i22 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
+        const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+        const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 368))), vk_zero_point);
+        i23 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
+        const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+        const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 384))), vk_zero_point);
+        i24 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi24x0123, vk24x0123));
+
+        k += 4;
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c b/src/qu8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
new file mode 100644
index 0000000..6a4c28d
--- /dev/null
+++ b/src/qu8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
@@ -0,0 +1,744 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32(
+    size_t channels,
+    size_t output_width,
+    const uint8_t** input,
+    const void* weights,
+    uint8_t* output,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const uint8_t* zero,
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));
+  do {
+    const uint8_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    const uint8_t* i9 = input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != zero) {
+      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
+    }
+    const uint8_t* i10 = input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != zero) {
+      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
+    }
+    const uint8_t* i11 = input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != zero) {
+      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
+    }
+    const uint8_t* i12 = input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != zero) {
+      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
+    }
+    const uint8_t* i13 = input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != zero) {
+      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
+    }
+    const uint8_t* i14 = input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != zero) {
+      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
+    }
+    const uint8_t* i15 = input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != zero) {
+      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
+    }
+    const uint8_t* i16 = input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != zero) {
+      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
+    }
+    const uint8_t* i17 = input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != zero) {
+      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
+    }
+    const uint8_t* i18 = input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != zero) {
+      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
+    }
+    const uint8_t* i19 = input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != zero) {
+      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
+    }
+    const uint8_t* i20 = input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != zero) {
+      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
+    }
+    const uint8_t* i21 = input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != zero) {
+      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
+    }
+    const uint8_t* i22 = input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != zero) {
+      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
+    }
+    const uint8_t* i23 = input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != zero) {
+      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
+    }
+    const uint8_t* i24 = input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != zero) {
+      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const void* w = weights;
+    for (; c >= 16; c -= 16) {
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
+      __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
+
+
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 8));
+      const __m128i vk0x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 12));
+      const __m128i vk0xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi0x4567, vk0x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi0x89AB, vk0x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi0xCDEF, vk0xCDEF));
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 8));
+      const __m128i vk1x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 12));
+      const __m128i vk1xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi1x4567, vk1x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi1x89AB, vk1x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi1xCDEF, vk1xCDEF));
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 8));
+      const __m128i vk2x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 12));
+      const __m128i vk2xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi2x4567, vk2x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi2x89AB, vk2x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi2xCDEF, vk2xCDEF));
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 8));
+      const __m128i vk3x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 12));
+      const __m128i vk3xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi3x4567, vk3x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi3x89AB, vk3x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi3xCDEF, vk3xCDEF));
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 8));
+      const __m128i vk4x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 12));
+      const __m128i vk4xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi4x4567, vk4x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi4x89AB, vk4x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi4xCDEF, vk4xCDEF));
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 8));
+      const __m128i vk5x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 12));
+      const __m128i vk5xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi5x4567, vk5x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi5x89AB, vk5x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi5xCDEF, vk5xCDEF));
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 8));
+      const __m128i vk6x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 12));
+      const __m128i vk6xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi6x4567, vk6x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi6x89AB, vk6x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi6xCDEF, vk6xCDEF));
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 8));
+      const __m128i vk7x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 12));
+      const __m128i vk7xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi7x4567, vk7x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi7x89AB, vk7x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi7xCDEF, vk7xCDEF));
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 8));
+      const __m128i vk8x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 12));
+      const __m128i vk8xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi8x4567, vk8x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi8x89AB, vk8x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi8xCDEF, vk8xCDEF));
+
+      const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+      const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 4));
+      const __m128i vk9x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 8));
+      const __m128i vk9x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 12));
+      const __m128i vk9xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(uint8_t)))), vk_zero_point);
+      i9 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi9x4567, vk9x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi9x89AB, vk9x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi9xCDEF, vk9xCDEF));
+
+      const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+      const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 4));
+      const __m128i vk10x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 8));
+      const __m128i vk10x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 12));
+      const __m128i vk10xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(uint8_t)))), vk_zero_point);
+      i10 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi10x4567, vk10x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi10x89AB, vk10x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi10xCDEF, vk10xCDEF));
+
+      const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+      const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 4));
+      const __m128i vk11x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 8));
+      const __m128i vk11x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 12));
+      const __m128i vk11xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(uint8_t)))), vk_zero_point);
+      i11 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi11x4567, vk11x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi11x89AB, vk11x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi11xCDEF, vk11xCDEF));
+
+      const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+      const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 4));
+      const __m128i vk12x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 8));
+      const __m128i vk12x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 12));
+      const __m128i vk12xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(uint8_t)))), vk_zero_point);
+      i12 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi12x4567, vk12x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi12x89AB, vk12x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi12xCDEF, vk12xCDEF));
+
+      const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+      const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 4));
+      const __m128i vk13x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 8));
+      const __m128i vk13x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 12));
+      const __m128i vk13xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(uint8_t)))), vk_zero_point);
+      i13 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi13x4567, vk13x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi13x89AB, vk13x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi13xCDEF, vk13xCDEF));
+
+      const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+      const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 4));
+      const __m128i vk14x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 8));
+      const __m128i vk14x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 12));
+      const __m128i vk14xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(uint8_t)))), vk_zero_point);
+      i14 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi14x4567, vk14x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi14x89AB, vk14x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi14xCDEF, vk14xCDEF));
+
+      const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+      const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 4));
+      const __m128i vk15x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 8));
+      const __m128i vk15x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 12));
+      const __m128i vk15xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(uint8_t)))), vk_zero_point);
+      i15 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi15x4567, vk15x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi15x89AB, vk15x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi15xCDEF, vk15xCDEF));
+
+      const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+      const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 4));
+      const __m128i vk16x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 8));
+      const __m128i vk16x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 12));
+      const __m128i vk16xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(uint8_t)))), vk_zero_point);
+      i16 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi16x4567, vk16x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi16x89AB, vk16x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi16xCDEF, vk16xCDEF));
+
+      const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+      const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 4));
+      const __m128i vk17x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 8));
+      const __m128i vk17x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 12));
+      const __m128i vk17xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(uint8_t)))), vk_zero_point);
+      i17 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi17x4567, vk17x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi17x89AB, vk17x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi17xCDEF, vk17xCDEF));
+
+      const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+      const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 4));
+      const __m128i vk18x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 8));
+      const __m128i vk18x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 12));
+      const __m128i vk18xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(uint8_t)))), vk_zero_point);
+      i18 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi18x4567, vk18x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi18x89AB, vk18x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi18xCDEF, vk18xCDEF));
+
+      const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+      const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 4));
+      const __m128i vk19x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 8));
+      const __m128i vk19x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 12));
+      const __m128i vk19xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(uint8_t)))), vk_zero_point);
+      i19 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi19x4567, vk19x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi19x89AB, vk19x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi19xCDEF, vk19xCDEF));
+
+      const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+      const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 4));
+      const __m128i vk20x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 8));
+      const __m128i vk20x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 12));
+      const __m128i vk20xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(uint8_t)))), vk_zero_point);
+      i20 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi20x4567, vk20x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi20x89AB, vk20x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi20xCDEF, vk20xCDEF));
+
+      const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+      const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 4));
+      const __m128i vk21x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 8));
+      const __m128i vk21x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 12));
+      const __m128i vk21xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(uint8_t)))), vk_zero_point);
+      i21 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi21x4567, vk21x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi21x89AB, vk21x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi21xCDEF, vk21xCDEF));
+
+      const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+      const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 4));
+      const __m128i vk22x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 8));
+      const __m128i vk22x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 12));
+      const __m128i vk22xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(uint8_t)))), vk_zero_point);
+      i22 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi22x4567, vk22x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi22x89AB, vk22x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi22xCDEF, vk22xCDEF));
+
+      const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+      const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 4));
+      const __m128i vk23x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 8));
+      const __m128i vk23x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 12));
+      const __m128i vk23xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(uint8_t)))), vk_zero_point);
+      i23 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi23x4567, vk23x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi23x89AB, vk23x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi23xCDEF, vk23xCDEF));
+
+      const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+      const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 4));
+      const __m128i vk24x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 8));
+      const __m128i vk24x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 12));
+      const __m128i vk24xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(uint8_t)))), vk_zero_point);
+      i24 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi24x0123, vk24x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi24x4567, vk24x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi24x89AB, vk24x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi24xCDEF, vk24xCDEF));
+
+      w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(uint8_t));
+
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+      __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
+      __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+      vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
+      vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+      vacc89AB = _mm_cvtps_epi32(vscaled89AB);
+      vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
+
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+      __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
+
+      _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+      output += 16;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
+      do {
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i1 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 80))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 96))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 112))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 128))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+        const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+        const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 144))), vk_zero_point);
+        i9 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
+        const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+        const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 160))), vk_zero_point);
+        i10 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
+        const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+        const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 176))), vk_zero_point);
+        i11 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
+        const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+        const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 192))), vk_zero_point);
+        i12 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
+        const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+        const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 208))), vk_zero_point);
+        i13 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
+        const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+        const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 224))), vk_zero_point);
+        i14 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
+        const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+        const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 240))), vk_zero_point);
+        i15 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
+        const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+        const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 256))), vk_zero_point);
+        i16 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
+        const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+        const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 272))), vk_zero_point);
+        i17 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
+        const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+        const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 288))), vk_zero_point);
+        i18 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
+        const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+        const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 304))), vk_zero_point);
+        i19 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
+        const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+        const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 320))), vk_zero_point);
+        i20 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
+        const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+        const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 336))), vk_zero_point);
+        i21 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
+        const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+        const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 352))), vk_zero_point);
+        i22 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
+        const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+        const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 368))), vk_zero_point);
+        i23 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
+        const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+        const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 384))), vk_zero_point);
+        i24 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi24x0123, vk24x0123));
+
+        k += 4;
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c b/src/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
new file mode 100644
index 0000000..718412a
--- /dev/null
+++ b/src/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
@@ -0,0 +1,749 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32(
+    size_t channels,
+    size_t output_width,
+    const uint8_t** input,
+    const void* weights,
+    uint8_t* output,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const uint8_t* zero,
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));
+  do {
+    const uint8_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    const uint8_t* i9 = input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != zero) {
+      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
+    }
+    const uint8_t* i10 = input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != zero) {
+      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
+    }
+    const uint8_t* i11 = input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != zero) {
+      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
+    }
+    const uint8_t* i12 = input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != zero) {
+      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
+    }
+    const uint8_t* i13 = input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != zero) {
+      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
+    }
+    const uint8_t* i14 = input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != zero) {
+      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
+    }
+    const uint8_t* i15 = input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != zero) {
+      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
+    }
+    const uint8_t* i16 = input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != zero) {
+      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
+    }
+    const uint8_t* i17 = input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != zero) {
+      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
+    }
+    const uint8_t* i18 = input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != zero) {
+      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
+    }
+    const uint8_t* i19 = input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != zero) {
+      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
+    }
+    const uint8_t* i20 = input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != zero) {
+      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
+    }
+    const uint8_t* i21 = input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != zero) {
+      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
+    }
+    const uint8_t* i22 = input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != zero) {
+      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
+    }
+    const uint8_t* i23 = input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != zero) {
+      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
+    }
+    const uint8_t* i24 = input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != zero) {
+      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const void* w = weights;
+    for (; c >= 16; c -= 16) {
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
+      __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
+
+
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 8));
+      const __m128i vk0x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 12));
+      const __m128i vk0xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi0x4567, vk0x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi0x89AB, vk0x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi0xCDEF, vk0xCDEF, vaccCDEF);
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 8));
+      const __m128i vk1x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 12));
+      const __m128i vk1xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi1x4567, vk1x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi1x89AB, vk1x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi1xCDEF, vk1xCDEF, vaccCDEF);
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 8));
+      const __m128i vk2x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 12));
+      const __m128i vk2xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi2x4567, vk2x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi2x89AB, vk2x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi2xCDEF, vk2xCDEF, vaccCDEF);
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 8));
+      const __m128i vk3x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 12));
+      const __m128i vk3xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi3x4567, vk3x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi3x89AB, vk3x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi3xCDEF, vk3xCDEF, vaccCDEF);
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 8));
+      const __m128i vk4x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 12));
+      const __m128i vk4xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi4x4567, vk4x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi4x89AB, vk4x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi4xCDEF, vk4xCDEF, vaccCDEF);
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 8));
+      const __m128i vk5x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 12));
+      const __m128i vk5xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi5x4567, vk5x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi5x89AB, vk5x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi5xCDEF, vk5xCDEF, vaccCDEF);
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 8));
+      const __m128i vk6x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 12));
+      const __m128i vk6xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi6x4567, vk6x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi6x89AB, vk6x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi6xCDEF, vk6xCDEF, vaccCDEF);
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 8));
+      const __m128i vk7x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 12));
+      const __m128i vk7xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi7x4567, vk7x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi7x89AB, vk7x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi7xCDEF, vk7xCDEF, vaccCDEF);
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 8));
+      const __m128i vk8x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 12));
+      const __m128i vk8xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi8x4567, vk8x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi8x89AB, vk8x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi8xCDEF, vk8xCDEF, vaccCDEF);
+
+      const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+      const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 4));
+      const __m128i vk9x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 148 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 8));
+      const __m128i vk9x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 12));
+      const __m128i vk9xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 156 * sizeof(uint8_t)))), vk_zero_point);
+      i9 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi9x4567, vk9x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi9x89AB, vk9x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi9xCDEF, vk9xCDEF, vaccCDEF);
+
+      const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+      const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 4));
+      const __m128i vk10x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 164 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 8));
+      const __m128i vk10x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 12));
+      const __m128i vk10xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 172 * sizeof(uint8_t)))), vk_zero_point);
+      i10 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi10x4567, vk10x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi10x89AB, vk10x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi10xCDEF, vk10xCDEF, vaccCDEF);
+
+      const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+      const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 4));
+      const __m128i vk11x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 180 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 8));
+      const __m128i vk11x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 12));
+      const __m128i vk11xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 188 * sizeof(uint8_t)))), vk_zero_point);
+      i11 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi11x4567, vk11x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi11x89AB, vk11x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi11xCDEF, vk11xCDEF, vaccCDEF);
+
+      const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+      const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 4));
+      const __m128i vk12x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 196 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 8));
+      const __m128i vk12x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 12));
+      const __m128i vk12xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 204 * sizeof(uint8_t)))), vk_zero_point);
+      i12 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi12x4567, vk12x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi12x89AB, vk12x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi12xCDEF, vk12xCDEF, vaccCDEF);
+
+      const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+      const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 4));
+      const __m128i vk13x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 212 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 8));
+      const __m128i vk13x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 12));
+      const __m128i vk13xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 220 * sizeof(uint8_t)))), vk_zero_point);
+      i13 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi13x4567, vk13x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi13x89AB, vk13x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi13xCDEF, vk13xCDEF, vaccCDEF);
+
+      const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+      const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 4));
+      const __m128i vk14x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 228 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 8));
+      const __m128i vk14x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 12));
+      const __m128i vk14xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 236 * sizeof(uint8_t)))), vk_zero_point);
+      i14 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi14x4567, vk14x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi14x89AB, vk14x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi14xCDEF, vk14xCDEF, vaccCDEF);
+
+      const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+      const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 4));
+      const __m128i vk15x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 244 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 8));
+      const __m128i vk15x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 12));
+      const __m128i vk15xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 252 * sizeof(uint8_t)))), vk_zero_point);
+      i15 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi15x4567, vk15x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi15x89AB, vk15x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi15xCDEF, vk15xCDEF, vaccCDEF);
+
+      const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+      const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 4));
+      const __m128i vk16x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 260 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 8));
+      const __m128i vk16x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 12));
+      const __m128i vk16xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 268 * sizeof(uint8_t)))), vk_zero_point);
+      i16 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi16x4567, vk16x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi16x89AB, vk16x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi16xCDEF, vk16xCDEF, vaccCDEF);
+
+      const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+      const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 4));
+      const __m128i vk17x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 276 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 8));
+      const __m128i vk17x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 12));
+      const __m128i vk17xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 284 * sizeof(uint8_t)))), vk_zero_point);
+      i17 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi17x4567, vk17x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi17x89AB, vk17x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi17xCDEF, vk17xCDEF, vaccCDEF);
+
+      const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+      const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 4));
+      const __m128i vk18x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 292 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 8));
+      const __m128i vk18x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 12));
+      const __m128i vk18xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 300 * sizeof(uint8_t)))), vk_zero_point);
+      i18 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi18x4567, vk18x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi18x89AB, vk18x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi18xCDEF, vk18xCDEF, vaccCDEF);
+
+      const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+      const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 4));
+      const __m128i vk19x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 308 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 8));
+      const __m128i vk19x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 12));
+      const __m128i vk19xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 316 * sizeof(uint8_t)))), vk_zero_point);
+      i19 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi19x4567, vk19x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi19x89AB, vk19x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi19xCDEF, vk19xCDEF, vaccCDEF);
+
+      const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+      const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 4));
+      const __m128i vk20x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 324 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 8));
+      const __m128i vk20x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 12));
+      const __m128i vk20xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 332 * sizeof(uint8_t)))), vk_zero_point);
+      i20 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi20x4567, vk20x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi20x89AB, vk20x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi20xCDEF, vk20xCDEF, vaccCDEF);
+
+      const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+      const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 4));
+      const __m128i vk21x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 340 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 8));
+      const __m128i vk21x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 12));
+      const __m128i vk21xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 348 * sizeof(uint8_t)))), vk_zero_point);
+      i21 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi21x4567, vk21x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi21x89AB, vk21x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi21xCDEF, vk21xCDEF, vaccCDEF);
+
+      const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+      const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 4));
+      const __m128i vk22x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 356 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 8));
+      const __m128i vk22x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 12));
+      const __m128i vk22xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 364 * sizeof(uint8_t)))), vk_zero_point);
+      i22 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi22x4567, vk22x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi22x89AB, vk22x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi22xCDEF, vk22xCDEF, vaccCDEF);
+
+      const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+      const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 4));
+      const __m128i vk23x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 372 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 8));
+      const __m128i vk23x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 12));
+      const __m128i vk23xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 380 * sizeof(uint8_t)))), vk_zero_point);
+      i23 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi23x4567, vk23x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi23x89AB, vk23x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi23xCDEF, vk23xCDEF, vaccCDEF);
+
+      const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+      const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 4));
+      const __m128i vk24x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 388 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 8));
+      const __m128i vk24x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 12));
+      const __m128i vk24xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 396 * sizeof(uint8_t)))), vk_zero_point);
+      i24 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi24x0123, vk24x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi24x4567, vk24x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi24x89AB, vk24x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi24xCDEF, vk24xCDEF, vaccCDEF);
+
+      w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(uint8_t));
+
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+      __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
+      __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+      vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
+      vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+      vacc89AB = _mm_cvtps_epi32(vscaled89AB);
+      vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
+
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+      __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
+
+      _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+      output += 16;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
+      do {
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i1 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 80))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 96))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 112))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 128))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
+        const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+        const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 144))), vk_zero_point);
+        i9 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
+        const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+        const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 160))), vk_zero_point);
+        i10 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
+        const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+        const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 176))), vk_zero_point);
+        i11 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
+        const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+        const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 192))), vk_zero_point);
+        i12 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
+        const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+        const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 208))), vk_zero_point);
+        i13 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
+        const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+        const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 224))), vk_zero_point);
+        i14 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
+        const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+        const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 240))), vk_zero_point);
+        i15 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
+        const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+        const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 256))), vk_zero_point);
+        i16 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
+        const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+        const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 272))), vk_zero_point);
+        i17 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
+        const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+        const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 288))), vk_zero_point);
+        i18 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
+        const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+        const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 304))), vk_zero_point);
+        i19 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
+        const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+        const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 320))), vk_zero_point);
+        i20 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
+        const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+        const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 336))), vk_zero_point);
+        i21 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
+        const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+        const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 352))), vk_zero_point);
+        i22 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
+        const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+        const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 368))), vk_zero_point);
+        i23 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
+        const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+        const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 384))), vk_zero_point);
+        i24 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi24x0123, vk24x0123, vacc0123);
+
+        k += 4;
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c b/src/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
new file mode 100644
index 0000000..ade1361
--- /dev/null
+++ b/src/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c
@@ -0,0 +1,344 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+// QU8 depthwise-convolution microkernel: 9 input rows per output pixel, 16 channels per main-loop pass, requantized through float.
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32(
+    size_t channels,  // channels per output pixel; asserted non-zero
+    size_t output_width,  // number of output pixels to produce; asserted non-zero
+    const uint8_t** input,  // 9 row pointers are read per pixel, then this advances by input_stride bytes
+    const void* weights,  // per 16-channel group: 16 int32 initial accumulators, then 9 x 16 uint8 kernel bytes
+    uint8_t* output,  // destination; `channels` bytes are written per pixel
+    size_t input_stride,  // byte step applied to `input` after each pixel
+    size_t output_increment,  // byte step applied to `output` after each pixel's channels are stored
+    size_t input_offset,  // byte offset added to every row pointer that differs from `zero`
+    const uint8_t* zero,  // sentinel row pointer that is used as-is, without input_offset
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // constants are read from params->fp32_sse2
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+  // Kernel zero point: the low 64 bits of the field are loaded and zero-extended as four 16-bit values into 32-bit lanes.
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));
+  do {  // one iteration per output pixel
+    const uint8_t* i0 = input[0];  // i0..i8: the nine input rows for this pixel
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {  // only rows other than `zero` get input_offset applied
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);  // step to the next pixel's row pointers
+    // Every pixel restarts from the full channel count and the beginning of the weights.
+    size_t c = channels;
+    const void* w = weights;
+    for (; c >= 16; c -= 16) {  // main loop: 16 channels per pass
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  // accumulators start from the 16 int32 values at w
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
+      __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
+
+      // Tap 0: widen 4 x uint8 inputs and kernel bytes to int32, subtract the kernel zero point from the kernel, multiply-accumulate.
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 8));
+      const __m128i vk0x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 12));
+      const __m128i vk0xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi0x4567, vk0x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi0x89AB, vk0x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi0xCDEF, vk0xCDEF));
+      // Tap 1: the kernel bytes of tap t start 16 * t bytes after the int32 block.
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 8));
+      const __m128i vk1x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 12));
+      const __m128i vk1xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi1x4567, vk1x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi1x89AB, vk1x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi1xCDEF, vk1xCDEF));
+      // Tap 2.
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 8));
+      const __m128i vk2x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 12));
+      const __m128i vk2xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi2x4567, vk2x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi2x89AB, vk2x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi2xCDEF, vk2xCDEF));
+      // Tap 3.
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 8));
+      const __m128i vk3x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 12));
+      const __m128i vk3xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi3x4567, vk3x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi3x89AB, vk3x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi3xCDEF, vk3xCDEF));
+      // Tap 4.
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 8));
+      const __m128i vk4x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 12));
+      const __m128i vk4xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi4x4567, vk4x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi4x89AB, vk4x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi4xCDEF, vk4xCDEF));
+      // Tap 5.
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 8));
+      const __m128i vk5x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 12));
+      const __m128i vk5xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi5x4567, vk5x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi5x89AB, vk5x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi5xCDEF, vk5xCDEF));
+      // Tap 6.
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 8));
+      const __m128i vk6x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 12));
+      const __m128i vk6xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi6x4567, vk6x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi6x89AB, vk6x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi6xCDEF, vk6xCDEF));
+      // Tap 7.
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 8));
+      const __m128i vk7x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 12));
+      const __m128i vk7xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi7x4567, vk7x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi7x89AB, vk7x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi7xCDEF, vk7xCDEF));
+      // Tap 8.
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 8));
+      const __m128i vk8x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 12));
+      const __m128i vk8xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi8x4567, vk8x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi8x89AB, vk8x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi8xCDEF, vk8xCDEF));
+      // Skip this group's 16 int32 values and its 9 * 16 = 144 kernel bytes.
+      w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t));
+      // Requantize: int32 -> float, multiply by scale, convert back to int32.
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+      __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
+      __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);  // aligned load of 4 floats
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+      vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
+      vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);  // rounds according to the current MXCSR rounding mode
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+      vacc89AB = _mm_cvtps_epi32(vscaled89AB);
+      vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
+      // Narrow to int16 with signed saturation, then add the output zero point with saturating 16-bit adds.
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+      __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+      // Narrow to uint8 with unsigned saturation, then clamp to [output_min, output_max].
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
+
+      _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);  // 16 result bytes, unaligned store
+      output += 16;
+    }
+    if XNN_UNLIKELY(c != 0) {  // 1..15 channels are left over
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);  // kernel bytes of the last group follow its 16 int32 values
+      do {  // 4 channels per pass
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+        // Each load below is 4 bytes wide even when fewer than 4 channels remain; only the stores are trimmed.
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i1 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 80))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 96))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 112))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 128))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+
+        k += 4;  // kernel bytes of the next 4 channels
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);  // initial accumulators of the next 4 channels
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {
+          _mm_storeu_si32(output, vout0123);  // full group: store 4 bytes
+          output += 4;
+          c -= 4;
+        } else {  // final 1..3 channels: store 2 bytes and/or 1 byte
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);  // bring result byte 2 down to byte 0 for the single-byte store
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);  // move to where the next pixel's output begins
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c b/src/qu8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
new file mode 100644
index 0000000..1700722
--- /dev/null
+++ b/src/qu8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c
@@ -0,0 +1,344 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32(
+    size_t channels,
+    size_t output_width,
+    const uint8_t** input,
+    const void* weights,
+    uint8_t* output,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const uint8_t* zero,
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));
+  do {
+    const uint8_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const void* w = weights;
+    for (; c >= 16; c -= 16) {
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
+      __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
+
+
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 8));
+      const __m128i vk0x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 12));
+      const __m128i vk0xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi0x4567, vk0x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi0x89AB, vk0x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi0xCDEF, vk0xCDEF));
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 8));
+      const __m128i vk1x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 12));
+      const __m128i vk1xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi1x4567, vk1x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi1x89AB, vk1x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi1xCDEF, vk1xCDEF));
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 8));
+      const __m128i vk2x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 12));
+      const __m128i vk2xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi2x4567, vk2x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi2x89AB, vk2x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi2xCDEF, vk2xCDEF));
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 8));
+      const __m128i vk3x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 12));
+      const __m128i vk3xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi3x4567, vk3x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi3x89AB, vk3x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi3xCDEF, vk3xCDEF));
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 8));
+      const __m128i vk4x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 12));
+      const __m128i vk4xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi4x4567, vk4x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi4x89AB, vk4x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi4xCDEF, vk4xCDEF));
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 8));
+      const __m128i vk5x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 12));
+      const __m128i vk5xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi5x4567, vk5x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi5x89AB, vk5x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi5xCDEF, vk5xCDEF));
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 8));
+      const __m128i vk6x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 12));
+      const __m128i vk6xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi6x4567, vk6x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi6x89AB, vk6x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi6xCDEF, vk6xCDEF));
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 8));
+      const __m128i vk7x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 12));
+      const __m128i vk7xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi7x4567, vk7x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi7x89AB, vk7x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi7xCDEF, vk7xCDEF));
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 8));
+      const __m128i vk8x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 12));
+      const __m128i vk8xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 16;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi8x4567, vk8x4567));
+      vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vi8x89AB, vk8x89AB));
+      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vi8xCDEF, vk8xCDEF));
+
+      w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t));
+
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+      __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
+      __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+      vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
+      vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+      vacc89AB = _mm_cvtps_epi32(vscaled89AB);
+      vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
+
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+      __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
+
+      _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+      output += 16;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
+      do {
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i1 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 80))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 96))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 112))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 128))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+
+        k += 4;
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c b/src/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
new file mode 100644
index 0000000..1a79630
--- /dev/null
+++ b/src/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
@@ -0,0 +1,349 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32(
+    size_t channels,  // channels per output pixel; asserted non-zero
+    size_t output_width,  // number of output pixels (outer-loop trips); asserted non-zero
+    const uint8_t** input,  // table of input row pointers; entries [0..8] are read for each pixel
+    const void* weights,  // per 16 channels: 16 int32 initial accumulator values (presumably bias), then 9*16 uint8 kernel bytes
+    uint8_t* output,  // destination; one uint8 is written per channel per pixel
+    size_t input_stride,  // bytes by which the pointer table is advanced after each pixel
+    size_t output_increment,  // bytes added to `output` after each pixel's channels are stored
+    size_t input_offset,  // byte offset added to every row pointer that differs from `zero`
+    const uint8_t* zero,  // sentinel row pointer; rows equal to it are used without input_offset
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // reads fp32_sse2.{kernel_zero_point, scale, output_zero_point, output_min, output_max}
+{  // QU8 depthwise-convolution microkernel: 9 taps, 16 channels per main-loop step, fp32 requantization, XOP multiply-accumulate
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));  // four 16-bit zero-point values widened to 32-bit lanes
+  do {  // one iteration per output pixel
+    const uint8_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {  // only rows that are not the `zero` sentinel get the offset
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);  // advance the pointer table by input_stride bytes for the next pixel
+
+    size_t c = channels;  // channels still to be produced for this pixel
+    const void* w = weights;  // the packed weights are re-read from the start for every pixel
+    for (; c >= 16; c -= 16) {  // main loop: 16 channels per iteration, as four 4-lane int32 accumulators
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  // accumulators start from the 16 packed int32 values
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
+      __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
+
+
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));  // 4 input bytes zero-extended to int32
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);  // 4 kernel bytes widened, minus the kernel zero point
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 8));
+      const __m128i vk0x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 12));
+      const __m128i vk0xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 16;  // tap 0 consumed 16 input bytes
+
+      vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);  // XOP multiply-accumulate: vacc += vi * vk in each 32-bit lane
+      vacc4567 = _mm_macc_epi32(vi0x4567, vk0x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi0x89AB, vk0x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi0xCDEF, vk0xCDEF, vaccCDEF);
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));  // taps 1..8 repeat the tap-0 pattern; kernel byte offset grows by 16 per tap
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 8));
+      const __m128i vk1x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 12));
+      const __m128i vk1xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi1x4567, vk1x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi1x89AB, vk1x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi1xCDEF, vk1xCDEF, vaccCDEF);
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 8));
+      const __m128i vk2x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 12));
+      const __m128i vk2xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi2x4567, vk2x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi2x89AB, vk2x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi2xCDEF, vk2xCDEF, vaccCDEF);
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 8));
+      const __m128i vk3x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 12));
+      const __m128i vk3xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi3x4567, vk3x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi3x89AB, vk3x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi3xCDEF, vk3xCDEF, vaccCDEF);
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 8));
+      const __m128i vk4x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 12));
+      const __m128i vk4xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 76 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi4x4567, vk4x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi4x89AB, vk4x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi4xCDEF, vk4xCDEF, vaccCDEF);
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 84 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 8));
+      const __m128i vk5x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 12));
+      const __m128i vk5xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 92 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi5x4567, vk5x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi5x89AB, vk5x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi5xCDEF, vk5xCDEF, vaccCDEF);
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 100 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 8));
+      const __m128i vk6x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 12));
+      const __m128i vk6xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 108 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi6x4567, vk6x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi6x89AB, vk6x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi6xCDEF, vk6xCDEF, vaccCDEF);
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 116 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 8));
+      const __m128i vk7x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 12));
+      const __m128i vk7xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 124 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi7x4567, vk7x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi7x89AB, vk7x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi7xCDEF, vk7xCDEF, vaccCDEF);
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 132 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x89AB = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 8));
+      const __m128i vk8x89AB = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8xCDEF = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 12));
+      const __m128i vk8xCDEF = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 140 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 16;
+
+      vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi8x4567, vk8x4567, vacc4567);
+      vacc89AB = _mm_macc_epi32(vi8x89AB, vk8x89AB, vacc89AB);
+      vaccCDEF = _mm_macc_epi32(vi8xCDEF, vk8xCDEF, vaccCDEF);
+
+      w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t));  // step over this group: 16 int32 values + 9*16 kernel bytes
+
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);  // requantization is done in fp32: int32 -> float
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+      __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
+      __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);  // aligned load: the scale field must be 16-byte aligned
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+      vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
+      vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);  // float -> int32 using the current MXCSR rounding mode
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+      vacc89AB = _mm_cvtps_epi32(vscaled89AB);
+      vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
+
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);  // saturate to int16, then saturating add of the output zero point
+      __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);  // saturate the 16 int16 values to uint8
+      vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);  // clamp from below with output_min
+      vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);  // clamp from above with output_max
+
+      _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);  // write 16 output channels
+      output += 16;
+    }
+    if XNN_UNLIKELY(c != 0) {  // 1..15 channels remain: finish them in groups of 4
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);  // kernel bytes start right after the 16 int32 values
+      do {
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  // next 4 int32 initial accumulator values
+
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));  // always loads 4 input bytes, even when fewer than 4 channels remain
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);  // consecutive taps are 16 kernel bytes apart
+        i1 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 80))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 96))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 112))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 128))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
+
+        k += 4;  // next 4 channels within every tap's 16-byte row
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);  // same fp32 requantization as the main loop, for 4 lanes
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);  // next 4 int32 initial accumulator values
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);  // the 4 results end up in the low 4 bytes
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {  // a full group: store all 4 bytes
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {  // final 1..3 channels: store 2 bytes and/or 1 byte
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);  // bring result byte 2 down to byte 0 for the optional single-byte store
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;  // the partial group was the last one; ends the remainder loop
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);  // move to where the next pixel's channels are written
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c b/src/qu8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
new file mode 100644
index 0000000..ba7490c
--- /dev/null
+++ b/src/qu8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c
@@ -0,0 +1,585 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32(  // 25-tap depthwise convolution over uint8 rows; writes `channels` clamped uint8 results per output position
+    size_t channels,  // channels per output position; must be non-zero (asserted)
+    size_t output_width,  // number of output positions to produce; must be non-zero (asserted)
+    const uint8_t** input,  // array of row pointers; entries [0..24] are read for each output position
+    const void* weights,  // per 8-channel group: 8 int32 accumulator seeds, then 25 x 8 uint8 kernel values
+    uint8_t* output,  // destination of the result bytes
+    size_t input_stride,  // bytes added to `input` after each output position
+    size_t output_increment,  // bytes added to `output` after all channels of a position are written
+    size_t input_offset,  // bytes added to every row pointer that differs from `zero`
+    const uint8_t* zero,  // row pointer value exempt from input_offset (presumably a shared padding row - confirm with caller)
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // only the fp32_sse2 member is read here
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));  // four 16-bit entries zero-extended to 32-bit lanes
+  do {  // one iteration per output position
+    const uint8_t* i0 = input[0];  // rows i1..i24 below are set up the same way
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {  // the `zero` row is used as-is, without input_offset
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    const uint8_t* i9 = input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != zero) {
+      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
+    }
+    const uint8_t* i10 = input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != zero) {
+      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
+    }
+    const uint8_t* i11 = input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != zero) {
+      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
+    }
+    const uint8_t* i12 = input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != zero) {
+      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
+    }
+    const uint8_t* i13 = input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != zero) {
+      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
+    }
+    const uint8_t* i14 = input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != zero) {
+      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
+    }
+    const uint8_t* i15 = input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != zero) {
+      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
+    }
+    const uint8_t* i16 = input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != zero) {
+      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
+    }
+    const uint8_t* i17 = input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != zero) {
+      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
+    }
+    const uint8_t* i18 = input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != zero) {
+      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
+    }
+    const uint8_t* i19 = input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != zero) {
+      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
+    }
+    const uint8_t* i20 = input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != zero) {
+      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
+    }
+    const uint8_t* i21 = input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != zero) {
+      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
+    }
+    const uint8_t* i22 = input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != zero) {
+      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
+    }
+    const uint8_t* i23 = input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != zero) {
+      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
+    }
+    const uint8_t* i24 = input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != zero) {
+      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);  // step the row-pointer array for the next output position
+
+    size_t c = channels;  // channels still to be produced at this position
+    const void* w = weights;  // restart from the first weight group at every position
+    for (; c >= 8; c -= 8) {  // main loop: 8 channels per iteration
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  // int32 accumulator seeds, channels 0-3
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));  // int32 accumulator seeds, channels 4-7
+
+
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));  // 4 input bytes of row 0 zero-extended to int32 lanes
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);  // 4 kernel bytes of tap 0, widened, minus kernel zero point
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 8;  // 8 channels of this row consumed
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));  // acc += input * (kernel - kernel_zero_point), low 32 bits of each product
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi0x4567, vk0x4567));  // taps 1..24 repeat this pattern at kernel byte offset 8 * tap
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi1x4567, vk1x4567));
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi2x4567, vk2x4567));
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi3x4567, vk3x4567));
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi4x4567, vk4x4567));
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi5x4567, vk5x4567));
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi6x4567, vk6x4567));
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi7x4567, vk7x4567));
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi8x4567, vk8x4567));
+
+      const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+      const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 4));
+      const __m128i vk9x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(uint8_t)))), vk_zero_point);
+      i9 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi9x4567, vk9x4567));
+
+      const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+      const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 4));
+      const __m128i vk10x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(uint8_t)))), vk_zero_point);
+      i10 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi10x4567, vk10x4567));
+
+      const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+      const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 4));
+      const __m128i vk11x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(uint8_t)))), vk_zero_point);
+      i11 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi11x4567, vk11x4567));
+
+      const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+      const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 4));
+      const __m128i vk12x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(uint8_t)))), vk_zero_point);
+      i12 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi12x4567, vk12x4567));
+
+      const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+      const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 4));
+      const __m128i vk13x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(uint8_t)))), vk_zero_point);
+      i13 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi13x4567, vk13x4567));
+
+      const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+      const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 4));
+      const __m128i vk14x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(uint8_t)))), vk_zero_point);
+      i14 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi14x4567, vk14x4567));
+
+      const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+      const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 4));
+      const __m128i vk15x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(uint8_t)))), vk_zero_point);
+      i15 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi15x4567, vk15x4567));
+
+      const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+      const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 4));
+      const __m128i vk16x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(uint8_t)))), vk_zero_point);
+      i16 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi16x4567, vk16x4567));
+
+      const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+      const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 4));
+      const __m128i vk17x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(uint8_t)))), vk_zero_point);
+      i17 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi17x4567, vk17x4567));
+
+      const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+      const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 4));
+      const __m128i vk18x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(uint8_t)))), vk_zero_point);
+      i18 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi18x4567, vk18x4567));
+
+      const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+      const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 4));
+      const __m128i vk19x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(uint8_t)))), vk_zero_point);
+      i19 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi19x4567, vk19x4567));
+
+      const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+      const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 4));
+      const __m128i vk20x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(uint8_t)))), vk_zero_point);
+      i20 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi20x4567, vk20x4567));
+
+      const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+      const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 4));
+      const __m128i vk21x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(uint8_t)))), vk_zero_point);
+      i21 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi21x4567, vk21x4567));
+
+      const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+      const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 4));
+      const __m128i vk22x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(uint8_t)))), vk_zero_point);
+      i22 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi22x4567, vk22x4567));
+
+      const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+      const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 4));
+      const __m128i vk23x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(uint8_t)))), vk_zero_point);
+      i23 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi23x4567, vk23x4567));
+
+      const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+      const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 4));
+      const __m128i vk24x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(uint8_t)))), vk_zero_point);
+      i24 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi24x0123, vk24x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi24x4567, vk24x4567));
+
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(uint8_t));  // skip this group: 8 int32 seeds + 25 * 8 kernel bytes
+
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);  // requantize in float: int32 -> fp32
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);  // aligned 4-float load; the same vector scales both halves
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);  // fp32 -> int32, rounded per the current MXCSR rounding mode
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);  // narrow to int16 with signed saturation, then saturating add of the output zero point
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);  // narrow to uint8 with unsigned saturation; both 8-byte halves hold the same 8 results
+      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);  // clamp to [output_min, output_max]
+      vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
+
+      _mm_storel_epi64((__m128i*) output, vout0123456701234567);  // write the low 8 bytes
+      output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {  // remainder: 1..7 channels left
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 8);  // kernel bytes start after the 8 int32 seeds at w
+      do {  // each pass computes 4 channels; at most two passes run
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));  // always reads 4 bytes, even when fewer than 4 channels remain
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;  // NOTE(review): presumably callers guarantee the over-read bytes are readable - confirm
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 8))), vk_zero_point);  // tap stride stays 8 bytes, as in the main loop
+        i1 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 24))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 40))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 56))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+        const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+        const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 72))), vk_zero_point);
+        i9 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
+        const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+        const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 80))), vk_zero_point);
+        i10 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
+        const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+        const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 88))), vk_zero_point);
+        i11 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
+        const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+        const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 96))), vk_zero_point);
+        i12 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
+        const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+        const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 104))), vk_zero_point);
+        i13 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
+        const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+        const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 112))), vk_zero_point);
+        i14 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
+        const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+        const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 120))), vk_zero_point);
+        i15 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
+        const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+        const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 128))), vk_zero_point);
+        i16 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
+        const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+        const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 136))), vk_zero_point);
+        i17 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
+        const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+        const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 144))), vk_zero_point);
+        i18 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
+        const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+        const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 152))), vk_zero_point);
+        i19 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
+        const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+        const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 160))), vk_zero_point);
+        i20 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
+        const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+        const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 168))), vk_zero_point);
+        i21 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
+        const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+        const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 176))), vk_zero_point);
+        i22 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
+        const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+        const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 184))), vk_zero_point);
+        i23 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
+        const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+        const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 192))), vk_zero_point);
+        i24 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi24x0123, vk24x0123));
+
+        k += 4;  // a second pass reads the kernel bytes 4 further into each 8-byte tap
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);  // same float requantization as the main loop
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);  // a second pass reads the next 4 int32 seeds
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);  // the 4 result bytes end up in bytes 0-3
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {  // full group: store all 4 result bytes
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {  // partial group: store only the low c (1..3) result bytes
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);  // writes result bytes 0-1 through a uint16_t store
+            vout0123 = _mm_srli_epi32(vout0123, 16);  // bring result byte 2 down to byte 0 for the 1-byte store below
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;  // tail written; ends the remainder loop
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);  // byte step to where the next position's results go
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c b/src/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
new file mode 100644
index 0000000..be6ed54
--- /dev/null
+++ b/src/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c
@@ -0,0 +1,585 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32(  // QU8 depthwise-conv microkernel: 25 taps per output pixel, 8 channels per main-loop pass
+    size_t channels,  // channels per output pixel; must be non-zero (asserted below)
+    size_t output_width,  // number of output pixels to produce; must be non-zero (asserted below)
+    const uint8_t** input,  // 25 row pointers are read per output pixel (input[0..24])
+    const void* weights,  // per 8-channel group: 8 int32 accumulator seeds (presumably bias), then 25 * 8 uint8 weights
+    uint8_t* output,  // destination; receives `channels` bytes per output pixel
+    size_t input_stride,  // bytes added to `input` after each output pixel
+    size_t output_increment,  // bytes added to `output` after each pixel's channels are written
+    size_t input_offset,  // byte offset applied to every row pointer that is not equal to `zero`
+    const uint8_t* zero,  // sentinel row pointer; presumably a shared padding row - confirm with caller
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // reads fp32_sse2.{kernel_zero_point, scale, output_zero_point, output_min, output_max}
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));  // four 16-bit zero-point values zero-extended to 32-bit lanes
+  do {  // one iteration per output pixel
+    const uint8_t* i0 = input[0];  // row pointer for tap 0; the same pattern repeats for taps 1..24
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {  // rows equal to `zero` are used as-is, without input_offset
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    const uint8_t* i9 = input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != zero) {
+      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
+    }
+    const uint8_t* i10 = input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != zero) {
+      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
+    }
+    const uint8_t* i11 = input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != zero) {
+      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
+    }
+    const uint8_t* i12 = input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != zero) {
+      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
+    }
+    const uint8_t* i13 = input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != zero) {
+      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
+    }
+    const uint8_t* i14 = input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != zero) {
+      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
+    }
+    const uint8_t* i15 = input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != zero) {
+      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
+    }
+    const uint8_t* i16 = input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != zero) {
+      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
+    }
+    const uint8_t* i17 = input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != zero) {
+      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
+    }
+    const uint8_t* i18 = input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != zero) {
+      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
+    }
+    const uint8_t* i19 = input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != zero) {
+      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
+    }
+    const uint8_t* i20 = input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != zero) {
+      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
+    }
+    const uint8_t* i21 = input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != zero) {
+      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
+    }
+    const uint8_t* i22 = input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != zero) {
+      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
+    }
+    const uint8_t* i23 = input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != zero) {
+      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
+    }
+    const uint8_t* i24 = input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != zero) {
+      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);  // step to the next pixel's row-pointer set (stride is in bytes)
+
+    size_t c = channels;
+    const void* w = weights;  // every output pixel restarts from the beginning of the packed weights
+    for (; c >= 8; c -= 8) {  // main loop: 8 channels per iteration
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  // accumulators start from the 8 packed int32 values at w
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+
+
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));  // 4 input bytes zero-extended to 32-bit lanes
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);  // 4 weight bytes, zero-extended, minus the kernel zero point
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));  // 32-bit multiply-accumulate (low 32 bits of each product)
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi0x4567, vk0x4567));
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));  // taps 1..24 repeat the tap-0 pattern; tap t weights start 8*t bytes after the int32 block
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi1x4567, vk1x4567));
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi2x4567, vk2x4567));
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi3x4567, vk3x4567));
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi4x4567, vk4x4567));
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi5x4567, vk5x4567));
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi6x4567, vk6x4567));
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi7x4567, vk7x4567));
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi8x4567, vk8x4567));
+
+      const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+      const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 4));
+      const __m128i vk9x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(uint8_t)))), vk_zero_point);
+      i9 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi9x4567, vk9x4567));
+
+      const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+      const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 4));
+      const __m128i vk10x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(uint8_t)))), vk_zero_point);
+      i10 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi10x4567, vk10x4567));
+
+      const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+      const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 4));
+      const __m128i vk11x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(uint8_t)))), vk_zero_point);
+      i11 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi11x4567, vk11x4567));
+
+      const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+      const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 4));
+      const __m128i vk12x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(uint8_t)))), vk_zero_point);
+      i12 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi12x4567, vk12x4567));
+
+      const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+      const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 4));
+      const __m128i vk13x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(uint8_t)))), vk_zero_point);
+      i13 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi13x4567, vk13x4567));
+
+      const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+      const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 4));
+      const __m128i vk14x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(uint8_t)))), vk_zero_point);
+      i14 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi14x4567, vk14x4567));
+
+      const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+      const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 4));
+      const __m128i vk15x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(uint8_t)))), vk_zero_point);
+      i15 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi15x4567, vk15x4567));
+
+      const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+      const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 4));
+      const __m128i vk16x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(uint8_t)))), vk_zero_point);
+      i16 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi16x4567, vk16x4567));
+
+      const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+      const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 4));
+      const __m128i vk17x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(uint8_t)))), vk_zero_point);
+      i17 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi17x4567, vk17x4567));
+
+      const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+      const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 4));
+      const __m128i vk18x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(uint8_t)))), vk_zero_point);
+      i18 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi18x4567, vk18x4567));
+
+      const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+      const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 4));
+      const __m128i vk19x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(uint8_t)))), vk_zero_point);
+      i19 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi19x4567, vk19x4567));
+
+      const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+      const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 4));
+      const __m128i vk20x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(uint8_t)))), vk_zero_point);
+      i20 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi20x4567, vk20x4567));
+
+      const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+      const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 4));
+      const __m128i vk21x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(uint8_t)))), vk_zero_point);
+      i21 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi21x4567, vk21x4567));
+
+      const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+      const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 4));
+      const __m128i vk22x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(uint8_t)))), vk_zero_point);
+      i22 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi22x4567, vk22x4567));
+
+      const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+      const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 4));
+      const __m128i vk23x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(uint8_t)))), vk_zero_point);
+      i23 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi23x4567, vk23x4567));
+
+      const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+      const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 4));
+      const __m128i vk24x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(uint8_t)))), vk_zero_point);
+      i24 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi24x0123, vk24x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi24x4567, vk24x4567));
+
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(uint8_t));  // skip this group's 8 int32 values and 25 * 8 weight bytes
+
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);  // int32 accumulators -> fp32 for requantization
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);  // aligned load: the scale field must be 16-byte aligned
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);  // fp32 -> int32, rounded according to the current MXCSR rounding mode
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);  // saturating pack to int16, then saturating add of the output zero point
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);  // saturating pack to uint8; both 8-byte halves hold the same 8 results
+      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);  // clamp from below
+      vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);  // clamp from above
+
+      _mm_storel_epi64((__m128i*) output, vout0123456701234567);  // write the low 8 bytes (8 channels)
+      output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {  // remainder: 1..7 channels left after the 8-wide loop
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 8);  // first weight byte, just past the group's 8 int32 values
+      do {  // each pass computes 4 lanes and stores up to 4 channels
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));  // NOTE(review): always loads 4 bytes even if fewer channels remain; presumably callers permit the over-read - confirm
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 8))), vk_zero_point);  // tap t weights sit at k + 8*t (weights stay packed for 8 channels)
+        i1 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 24))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 40))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 56))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+        const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+        const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 72))), vk_zero_point);
+        i9 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi9x0123, vk9x0123));
+        const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+        const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 80))), vk_zero_point);
+        i10 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi10x0123, vk10x0123));
+        const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+        const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 88))), vk_zero_point);
+        i11 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi11x0123, vk11x0123));
+        const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+        const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 96))), vk_zero_point);
+        i12 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi12x0123, vk12x0123));
+        const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+        const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 104))), vk_zero_point);
+        i13 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi13x0123, vk13x0123));
+        const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+        const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 112))), vk_zero_point);
+        i14 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi14x0123, vk14x0123));
+        const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+        const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 120))), vk_zero_point);
+        i15 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi15x0123, vk15x0123));
+        const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+        const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 128))), vk_zero_point);
+        i16 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi16x0123, vk16x0123));
+        const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+        const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 136))), vk_zero_point);
+        i17 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi17x0123, vk17x0123));
+        const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+        const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 144))), vk_zero_point);
+        i18 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi18x0123, vk18x0123));
+        const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+        const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 152))), vk_zero_point);
+        i19 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi19x0123, vk19x0123));
+        const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+        const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 160))), vk_zero_point);
+        i20 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi20x0123, vk20x0123));
+        const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+        const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 168))), vk_zero_point);
+        i21 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi21x0123, vk21x0123));
+        const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+        const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 176))), vk_zero_point);
+        i22 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi22x0123, vk22x0123));
+        const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+        const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 184))), vk_zero_point);
+        i23 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi23x0123, vk23x0123));
+        const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+        const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 192))), vk_zero_point);
+        i24 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi24x0123, vk24x0123));
+
+        k += 4;  // weights for the next 4 channels of the same group
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);  // int32 values for the next 4 channels
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {  // 4..7 left: store a full group of 4 bytes
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {  // 1..3 left: store exactly c bytes
+          if (c & 2) {  // 2 or 3 left: store the first two bytes
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);  // move byte 2 down to byte 0 for the odd-byte store
+            output += 2;
+          }
+          if (c & 1) {  // odd count: store the last remaining byte
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);  // caller-supplied gap to the next output pixel
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c b/src/qu8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
new file mode 100644
index 0000000..20cc808
--- /dev/null
+++ b/src/qu8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c
@@ -0,0 +1,590 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+// QU8 depthwise-convolution microkernel (25 taps, 8 channels per main-loop pass, XOP multiply-accumulate): per channel, acc = seed + sum over taps of input * (kernel - kernel_zero_point), then fp32 requantization to uint8 with min/max clamp.
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32(
+    size_t channels,  // channels processed per output pixel; must be non-zero
+    size_t output_width,  // number of output pixels (outer-loop iterations); must be non-zero
+    const uint8_t** input,  // array of input pointers; entries [0..24] are read for each output pixel
+    const void* weights,  // packed per 8-channel group: 8 int32 accumulator seeds (presumably biases), then 25x8 uint8 kernel bytes
+    uint8_t* output,  // destination; `channels` bytes are written per output pixel
+    size_t input_stride,  // byte step applied to `input` after each output pixel
+    size_t output_increment,  // byte step applied to `output` after each output pixel, on top of the bytes written
+    size_t input_offset,  // byte offset added to every input pointer that differs from `zero`
+    const uint8_t* zero,  // sentinel pointer: entries equal to it are used without input_offset (presumably a padding buffer)
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // only the fp32_sse2 member is read
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+  // Kernel zero point: 64-bit load whose four low 16-bit lanes are zero-extended to 32-bit lanes.
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));
+  do {  // one iteration per output pixel
+    const uint8_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {  // same rule for all 25 pointers: `zero` is used as-is, anything else gets input_offset
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    const uint8_t* i9 = input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != zero) {
+      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
+    }
+    const uint8_t* i10 = input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != zero) {
+      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
+    }
+    const uint8_t* i11 = input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != zero) {
+      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
+    }
+    const uint8_t* i12 = input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != zero) {
+      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
+    }
+    const uint8_t* i13 = input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != zero) {
+      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
+    }
+    const uint8_t* i14 = input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != zero) {
+      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
+    }
+    const uint8_t* i15 = input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != zero) {
+      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
+    }
+    const uint8_t* i16 = input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != zero) {
+      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
+    }
+    const uint8_t* i17 = input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != zero) {
+      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
+    }
+    const uint8_t* i18 = input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != zero) {
+      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
+    }
+    const uint8_t* i19 = input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != zero) {
+      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
+    }
+    const uint8_t* i20 = input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != zero) {
+      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
+    }
+    const uint8_t* i21 = input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != zero) {
+      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
+    }
+    const uint8_t* i22 = input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != zero) {
+      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
+    }
+    const uint8_t* i23 = input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != zero) {
+      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
+    }
+    const uint8_t* i24 = input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != zero) {
+      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);  // the next output pixel reads its 25 pointers from here
+
+    size_t c = channels;
+    const void* w = weights;
+    for (; c >= 8; c -= 8) {  // main loop: 8 channels per pass
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  // int32 accumulator seeds, channels 0-3
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));  // int32 accumulator seeds, channels 4-7
+
+      // Tap t: zero-extend 4+4 input bytes and 4+4 kernel bytes (at byte offset 8*t past the seeds) to int32, subtract the kernel zero point from the kernel values, then multiply-accumulate.
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi0x4567, vk0x4567, vacc4567);
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi1x4567, vk1x4567, vacc4567);
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi2x4567, vk2x4567, vacc4567);
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi3x4567, vk3x4567, vacc4567);
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi4x4567, vk4x4567, vacc4567);
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi5x4567, vk5x4567, vacc4567);
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi6x4567, vk6x4567, vacc4567);
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi7x4567, vk7x4567, vacc4567);
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi8x4567, vk8x4567, vacc4567);
+
+      const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+      const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi9x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9 + 4));
+      const __m128i vk9x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 76 * sizeof(uint8_t)))), vk_zero_point);
+      i9 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi9x4567, vk9x4567, vacc4567);
+
+      const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+      const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi10x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10 + 4));
+      const __m128i vk10x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 84 * sizeof(uint8_t)))), vk_zero_point);
+      i10 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi10x4567, vk10x4567, vacc4567);
+
+      const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+      const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi11x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11 + 4));
+      const __m128i vk11x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 92 * sizeof(uint8_t)))), vk_zero_point);
+      i11 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi11x4567, vk11x4567, vacc4567);
+
+      const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+      const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi12x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12 + 4));
+      const __m128i vk12x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 100 * sizeof(uint8_t)))), vk_zero_point);
+      i12 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi12x4567, vk12x4567, vacc4567);
+
+      const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+      const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi13x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13 + 4));
+      const __m128i vk13x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 108 * sizeof(uint8_t)))), vk_zero_point);
+      i13 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi13x4567, vk13x4567, vacc4567);
+
+      const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+      const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi14x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14 + 4));
+      const __m128i vk14x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 116 * sizeof(uint8_t)))), vk_zero_point);
+      i14 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi14x4567, vk14x4567, vacc4567);
+
+      const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+      const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi15x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15 + 4));
+      const __m128i vk15x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 124 * sizeof(uint8_t)))), vk_zero_point);
+      i15 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi15x4567, vk15x4567, vacc4567);
+
+      const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+      const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi16x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16 + 4));
+      const __m128i vk16x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 132 * sizeof(uint8_t)))), vk_zero_point);
+      i16 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi16x4567, vk16x4567, vacc4567);
+
+      const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+      const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi17x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17 + 4));
+      const __m128i vk17x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 140 * sizeof(uint8_t)))), vk_zero_point);
+      i17 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi17x4567, vk17x4567, vacc4567);
+
+      const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+      const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi18x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18 + 4));
+      const __m128i vk18x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 148 * sizeof(uint8_t)))), vk_zero_point);
+      i18 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi18x4567, vk18x4567, vacc4567);
+
+      const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+      const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi19x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19 + 4));
+      const __m128i vk19x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 156 * sizeof(uint8_t)))), vk_zero_point);
+      i19 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi19x4567, vk19x4567, vacc4567);
+
+      const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+      const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi20x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20 + 4));
+      const __m128i vk20x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 164 * sizeof(uint8_t)))), vk_zero_point);
+      i20 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi20x4567, vk20x4567, vacc4567);
+
+      const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+      const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi21x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21 + 4));
+      const __m128i vk21x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 172 * sizeof(uint8_t)))), vk_zero_point);
+      i21 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi21x4567, vk21x4567, vacc4567);
+
+      const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+      const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi22x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22 + 4));
+      const __m128i vk22x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 180 * sizeof(uint8_t)))), vk_zero_point);
+      i22 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi22x4567, vk22x4567, vacc4567);
+
+      const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+      const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi23x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23 + 4));
+      const __m128i vk23x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 188 * sizeof(uint8_t)))), vk_zero_point);
+      i23 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi23x4567, vk23x4567, vacc4567);
+
+      const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+      const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi24x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24 + 4));
+      const __m128i vk24x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 196 * sizeof(uint8_t)))), vk_zero_point);
+      i24 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi24x0123, vk24x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi24x4567, vk24x4567, vacc4567);
+
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(uint8_t));  // skip this group's 8 seeds and 25*8 kernel bytes
+      // Requantize: int32 -> fp32, multiply by the scale, convert back to int32 (rounding follows the current MXCSR mode).
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+      // Narrow to int16 with signed saturation, add the output zero point (saturating), then narrow to uint8 and clamp to [output_min, output_max].
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
+
+      _mm_storel_epi64((__m128i*) output, vout0123456701234567);  // writes the low 8 bytes
+      output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {  // 1-7 leftover channels, handled 4 lanes at a time
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 8);  // kernel bytes start right after the 8 int32 seeds
+      do {
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+        // Each tap loads 4 input bytes and 4 kernel bytes regardless of how many channels remain (c may be below 4 here).
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 8))), vk_zero_point);
+        i1 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 24))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 40))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 56))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
+        const __m128i vi9x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i9));
+        const __m128i vk9x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 72))), vk_zero_point);
+        i9 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi9x0123, vk9x0123, vacc0123);
+        const __m128i vi10x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i10));
+        const __m128i vk10x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 80))), vk_zero_point);
+        i10 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi10x0123, vk10x0123, vacc0123);
+        const __m128i vi11x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i11));
+        const __m128i vk11x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 88))), vk_zero_point);
+        i11 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi11x0123, vk11x0123, vacc0123);
+        const __m128i vi12x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i12));
+        const __m128i vk12x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 96))), vk_zero_point);
+        i12 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi12x0123, vk12x0123, vacc0123);
+        const __m128i vi13x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i13));
+        const __m128i vk13x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 104))), vk_zero_point);
+        i13 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi13x0123, vk13x0123, vacc0123);
+        const __m128i vi14x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i14));
+        const __m128i vk14x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 112))), vk_zero_point);
+        i14 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi14x0123, vk14x0123, vacc0123);
+        const __m128i vi15x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i15));
+        const __m128i vk15x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 120))), vk_zero_point);
+        i15 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi15x0123, vk15x0123, vacc0123);
+        const __m128i vi16x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i16));
+        const __m128i vk16x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 128))), vk_zero_point);
+        i16 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi16x0123, vk16x0123, vacc0123);
+        const __m128i vi17x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i17));
+        const __m128i vk17x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 136))), vk_zero_point);
+        i17 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi17x0123, vk17x0123, vacc0123);
+        const __m128i vi18x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i18));
+        const __m128i vk18x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 144))), vk_zero_point);
+        i18 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi18x0123, vk18x0123, vacc0123);
+        const __m128i vi19x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i19));
+        const __m128i vk19x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 152))), vk_zero_point);
+        i19 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi19x0123, vk19x0123, vacc0123);
+        const __m128i vi20x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i20));
+        const __m128i vk20x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 160))), vk_zero_point);
+        i20 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi20x0123, vk20x0123, vacc0123);
+        const __m128i vi21x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i21));
+        const __m128i vk21x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 168))), vk_zero_point);
+        i21 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi21x0123, vk21x0123, vacc0123);
+        const __m128i vi22x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i22));
+        const __m128i vk22x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 176))), vk_zero_point);
+        i22 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi22x0123, vk22x0123, vacc0123);
+        const __m128i vi23x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i23));
+        const __m128i vk23x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 184))), vk_zero_point);
+        i23 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi23x0123, vk23x0123, vacc0123);
+        const __m128i vi24x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i24));
+        const __m128i vk24x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 192))), vk_zero_point);
+        i24 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi24x0123, vk24x0123, vacc0123);
+
+        k += 4;  // next 4 channels within the same weight group
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);  // next 4 int32 seeds
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {  // final 1-3 channels: store 2 bytes and/or 1 byte so nothing is written past the last channel
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);  // bring byte 2 down to byte 0 for the single-byte store below
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);  // byte step to where the next output pixel is written
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c b/src/qu8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
new file mode 100644
index 0000000..a1ca2bb
--- /dev/null
+++ b/src/qu8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c
@@ -0,0 +1,281 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32(  // 9-tap QU8 depthwise convolution: 8 channels per main-loop step, 4 per remainder step
+    size_t channels,  // channels per output pixel; must be non-zero (asserted below)
+    size_t output_width,  // number of output pixels to produce; must be non-zero (asserted below)
+    const uint8_t** input,  // row pointers; input[0..8] are read for every output pixel
+    const void* weights,  // per 8-channel group: 8 int32 initial accumulator values, then 9 taps x 8 uint8 weights
+    uint8_t* output,  // destination; one uint8 is written per channel
+    size_t input_stride,  // bytes added to `input` after each output pixel
+    size_t output_increment,  // bytes added to `output` after each output pixel
+    size_t input_offset,  // bytes added to every row pointer that is not equal to `zero`
+    const uint8_t* zero,  // row pointer value that is used as-is, without input_offset
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // only the fp32_sse2 member is read
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));  // four 16-bit kernel zero points zero-extended to 32-bit lanes
+  do {  // one iteration per output pixel
+    const uint8_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {  // rows equal to `zero` are not offset; i1..i8 follow the same rule
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);  // input_stride is a byte count, not an element count
+
+    size_t c = channels;
+    const void* w = weights;  // weights are re-read from the start for every output pixel
+    for (; c >= 8; c -= 8) {  // main loop: 8 channels per iteration
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  // accumulators start from the 8 int32 values at the head of the group
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      // Tap 0: zero-extend 8 input bytes and 8 weight bytes to int32, subtract the kernel zero point from the
+      // weights, then multiply-accumulate. Taps 1..8 repeat this with the weight offset advanced by 8 bytes per tap.
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi0x4567, vk0x4567));
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi1x4567, vk1x4567));
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi2x4567, vk2x4567));
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi3x4567, vk3x4567));
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi4x4567, vk4x4567));
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi5x4567, vk5x4567));
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi6x4567, vk6x4567));
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi7x4567, vk7x4567));
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi8x4567, vk8x4567));
+
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t));  // step over the whole group: 8 int32 values + 9 * 8 weight bytes
+
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);  // requantization starts here: int32 -> float
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);  // aligned load; the same 4-float vector scales both halves
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);  // float -> int32
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);  // saturating pack to int16, then saturating add of the output zero point
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);  // unsigned-saturating pack to uint8; both 8-byte halves hold the same values
+      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);  // clamp to [output_min, output_max]
+      vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
+
+      _mm_storel_epi64((__m128i*) output, vout0123456701234567);  // write the low 8 bytes
+      output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {  // 1..7 channels remain after the main loop
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 8);  // weight bytes begin after the 8 int32 values of this group
+      do {  // remainder loop: 4 channels per iteration
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));  // NOTE: 4 bytes are read from every row even when fewer than 4 channels remain
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 8))), vk_zero_point);  // taps stay 8 bytes apart, matching the main-loop layout
+        i1 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 24))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 40))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 56))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+
+        k += 4;  // next 4 channels within each tap
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);  // next 4 int32 initial accumulator values
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {  // a full group of 4 outputs
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {  // 1..3 outputs left: written as 2 bytes and/or 1 byte
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);  // bring byte 2 down to byte 0 for the (c & 1) store
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);  // output_increment is a byte count
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c b/src/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
new file mode 100644
index 0000000..d25e6d9
--- /dev/null
+++ b/src/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c
@@ -0,0 +1,281 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32(  // 9-tap QU8 depthwise convolution: 8 channels per main-loop step, 4 per remainder step
+    size_t channels,  // channels per output pixel; must be non-zero (asserted below)
+    size_t output_width,  // number of output pixels to produce; must be non-zero (asserted below)
+    const uint8_t** input,  // row pointers; input[0..8] are read for every output pixel
+    const void* weights,  // per 8-channel group: 8 int32 initial accumulator values, then 9 taps x 8 uint8 weights
+    uint8_t* output,  // destination; one uint8 is written per channel
+    size_t input_stride,  // bytes added to `input` after each output pixel
+    size_t output_increment,  // bytes added to `output` after each output pixel
+    size_t input_offset,  // bytes added to every row pointer that is not equal to `zero`
+    const uint8_t* zero,  // row pointer value that is used as-is, without input_offset
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // only the fp32_sse2 member is read
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));  // four 16-bit kernel zero points zero-extended to 32-bit lanes
+  do {  // one iteration per output pixel
+    const uint8_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {  // rows equal to `zero` are not offset; i1..i8 follow the same rule
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);  // input_stride is a byte count, not an element count
+
+    size_t c = channels;
+    const void* w = weights;  // weights are re-read from the start for every output pixel
+    for (; c >= 8; c -= 8) {  // main loop: 8 channels per iteration
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  // accumulators start from the 8 int32 values at the head of the group
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      // Tap 0: zero-extend 8 input bytes and 8 weight bytes to int32, subtract the kernel zero point from the
+      // weights, then multiply-accumulate. Taps 1..8 repeat this with the weight offset advanced by 8 bytes per tap.
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi0x4567, vk0x4567));
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi1x4567, vk1x4567));
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi2x4567, vk2x4567));
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi3x4567, vk3x4567));
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi4x4567, vk4x4567));
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi5x4567, vk5x4567));
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi6x4567, vk6x4567));
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi7x4567, vk7x4567));
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 8;
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vi8x4567, vk8x4567));
+
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t));  // step over the whole group: 8 int32 values + 9 * 8 weight bytes
+
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);  // requantization starts here: int32 -> float
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);  // aligned load; the same 4-float vector scales both halves
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);  // float -> int32
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);  // saturating pack to int16, then saturating add of the output zero point
+
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);  // unsigned-saturating pack to uint8; both 8-byte halves hold the same values
+      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);  // clamp to [output_min, output_max]
+      vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
+
+      _mm_storel_epi64((__m128i*) output, vout0123456701234567);  // write the low 8 bytes
+      output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {  // 1..7 channels remain after the main loop
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 8);  // weight bytes begin after the 8 int32 values of this group
+      do {  // remainder loop: 4 channels per iteration
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));  // NOTE: 4 bytes are read from every row even when fewer than 4 channels remain
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi0x0123, vk0x0123));
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 8))), vk_zero_point);  // taps stay 8 bytes apart, matching the main-loop layout
+        i1 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi1x0123, vk1x0123));
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi2x0123, vk2x0123));
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 24))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi3x0123, vk3x0123));
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi4x0123, vk4x0123));
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 40))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi5x0123, vk5x0123));
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi6x0123, vk6x0123));
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 56))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi7x0123, vk7x0123));
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vi8x0123, vk8x0123));
+
+        k += 4;  // next 4 channels within each tap
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);  // next 4 int32 initial accumulator values
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+
+        if XNN_LIKELY(c >= 4) {  // a full group of 4 outputs
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {  // 1..3 outputs left: written as 2 bytes and/or 1 byte
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);  // bring byte 2 down to byte 0 for the (c & 1) store
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);  // output_increment is a byte count
+  } while (--output_width != 0);
+}
diff --git a/src/qu8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c b/src/qu8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
new file mode 100644
index 0000000..2e610ae
--- /dev/null
+++ b/src/qu8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c
@@ -0,0 +1,286 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-dwconv/unipass-sse-mul32.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/intrinsics-polyfill.h>
+
+// QU8 depthwise-convolution microkernel: 9 taps, 8 channels per main-loop iteration, fp32 requantization, XOP multiply-accumulate.
+void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32(
+    size_t channels,  // channels per output pixel; must be non-zero
+    size_t output_width,  // number of output pixels to produce; must be non-zero
+    const uint8_t** input,  // array of row pointers; 9 entries are read per output pixel
+    const void* weights,  // per 8-channel group: 8 x int32 followed by 9 x 8 x uint8
+    uint8_t* output,  // destination; `channels` bytes are written per output pixel
+    size_t input_stride,  // byte increment applied to `input` after each output pixel
+    size_t output_increment,  // byte increment applied to `output` after each pixel's channels are written
+    size_t input_offset,  // byte offset added to every row pointer that differs from `zero`
+    const uint8_t* zero,  // row pointer value that is used as-is (input_offset is not applied)
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // only the fp32_sse2 member is read
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+  // Kernel zero point: four 16-bit values zero-extended to 32-bit lanes; subtracted from every weight below.
+  const __m128i vk_zero_point = _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*) params->fp32_sse2.kernel_zero_point));
+  do {  // one iteration per output pixel
+    const uint8_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {  // same pattern for i1..i8: offset every row except the `zero` row
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint8_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint8_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint8_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint8_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint8_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint8_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint8_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint8_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+    }
+    input = (const uint8_t**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const void* w = weights;
+    for (; c >= 8; c -= 8) {  // main loop: 8 channels per iteration
+      __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  // accumulators start from the 8 int32 values at w (presumably the bias - confirm against the packing code)
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      // Each tap below zero-extends 4+4 input bytes and 4+4 weight bytes to 32-bit lanes, subtracts the kernel
+      // zero point from the weights, and accumulates input * weight into the accumulators with _mm_macc_epi32.
+      const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+      const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi0x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0 + 4));
+      const __m128i vk0x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 4 * sizeof(uint8_t)))), vk_zero_point);
+      i0 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi0x4567, vk0x4567, vacc4567);
+
+      const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+      const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi1x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1 + 4));
+      const __m128i vk1x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 12 * sizeof(uint8_t)))), vk_zero_point);
+      i1 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi1x4567, vk1x4567, vacc4567);
+
+      const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+      const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi2x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2 + 4));
+      const __m128i vk2x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 20 * sizeof(uint8_t)))), vk_zero_point);
+      i2 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi2x4567, vk2x4567, vacc4567);
+
+      const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+      const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi3x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3 + 4));
+      const __m128i vk3x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 28 * sizeof(uint8_t)))), vk_zero_point);
+      i3 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi3x4567, vk3x4567, vacc4567);
+
+      const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+      const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi4x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4 + 4));
+      const __m128i vk4x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 36 * sizeof(uint8_t)))), vk_zero_point);
+      i4 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi4x4567, vk4x4567, vacc4567);
+
+      const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+      const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi5x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5 + 4));
+      const __m128i vk5x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 44 * sizeof(uint8_t)))), vk_zero_point);
+      i5 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi5x4567, vk5x4567, vacc4567);
+
+      const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+      const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi6x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6 + 4));
+      const __m128i vk6x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 52 * sizeof(uint8_t)))), vk_zero_point);
+      i6 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi6x4567, vk6x4567, vacc4567);
+
+      const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+      const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi7x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7 + 4));
+      const __m128i vk7x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 60 * sizeof(uint8_t)))), vk_zero_point);
+      i7 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi7x4567, vk7x4567, vacc4567);
+
+      const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+      const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)))), vk_zero_point);
+      const __m128i vi8x4567 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8 + 4));
+      const __m128i vk8x4567 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 68 * sizeof(uint8_t)))), vk_zero_point);
+      i8 += 8;
+
+      vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
+      vacc4567 = _mm_macc_epi32(vi8x4567, vk8x4567, vacc4567);
+
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t));  // skip this group's 8 int32 + 72 uint8 values
+      // Requantize: int32 -> float, multiply by the scale, convert back to int32.
+      __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+      __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
+
+      const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+      vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
+      vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
+
+      vacc0123 = _mm_cvtps_epi32(vscaled0123);
+      vacc4567 = _mm_cvtps_epi32(vscaled4567);
+      // Narrow to int16 with signed saturation and add the output zero point with a saturating add.
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+      // Narrow to uint8 with unsigned saturation, then clamp to [output_min, output_max].
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
+
+      _mm_storel_epi64((__m128i*) output, vout0123456701234567);  // writes the low 8 bytes (8 channels)
+      output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {  // 1..7 channels remain
+      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 8);  // uint8 weights start after the 8 int32 values at w
+      do {  // up to 4 channels per iteration
+        __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
+        // Every load below reads 4 bytes per row and per tap, even when fewer than 4 channels remain.
+        const __m128i vi0x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i0));
+        const __m128i vk0x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32(k)), vk_zero_point);
+        i0 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi0x0123, vk0x0123, vacc0123);
+        const __m128i vi1x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i1));
+        const __m128i vk1x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 8))), vk_zero_point);  // consecutive taps are 8 bytes apart
+        i1 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi1x0123, vk1x0123, vacc0123);
+        const __m128i vi2x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i2));
+        const __m128i vk2x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 16))), vk_zero_point);
+        i2 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi2x0123, vk2x0123, vacc0123);
+        const __m128i vi3x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i3));
+        const __m128i vk3x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 24))), vk_zero_point);
+        i3 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi3x0123, vk3x0123, vacc0123);
+        const __m128i vi4x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i4));
+        const __m128i vk4x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 32))), vk_zero_point);
+        i4 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi4x0123, vk4x0123, vacc0123);
+        const __m128i vi5x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i5));
+        const __m128i vk5x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 40))), vk_zero_point);
+        i5 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi5x0123, vk5x0123, vacc0123);
+        const __m128i vi6x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i6));
+        const __m128i vk6x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 48))), vk_zero_point);
+        i6 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi6x0123, vk6x0123, vacc0123);
+        const __m128i vi7x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i7));
+        const __m128i vk7x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 56))), vk_zero_point);
+        i7 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi7x0123, vk7x0123, vacc0123);
+        const __m128i vi8x0123 = _mm_cvtepu8_epi32(_mm_loadu_si32(i8));
+        const __m128i vk8x0123 = _mm_sub_epi32(_mm_cvtepu8_epi32(_mm_loadu_si32((const void*) (k + 64))), vk_zero_point);
+        i8 += 4;
+
+        vacc0123 = _mm_macc_epi32(vi8x0123, vk8x0123, vacc0123);
+
+        k += 4;  // next 4 channels within every tap
+
+        __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
+        vscaled0123 = _mm_mul_ps(vscaled0123, _mm_load_ps(params->fp32_sse2.scale));
+        vacc0123 = _mm_cvtps_epi32(vscaled0123);
+
+        w = (const void*) ((const int32_t*) w + 4);  // next 4 int32 accumulator start values
+
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+        __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
+
+        vout0123 = _mm_packus_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
+        vout0123 = _mm_min_epu8(vout0123, _mm_load_si128((const __m128i*) params->fp32_sse2.output_max));
+        // Store 4 bytes while at least 4 channels remain; otherwise store 2 and/or 1 bytes according to the bits of c.
+        if XNN_LIKELY(c >= 4) {
+          _mm_storeu_si32(output, vout0123);
+          output += 4;
+          c -= 4;
+        } else {
+          if (c & 2) {
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123, 0);
+            vout0123 = _mm_srli_epi32(vout0123, 16);  // bring byte 2 down to byte 0 for the odd-channel store
+            output += 2;
+          }
+          if (c & 1) {
+            *output = (uint8_t) _mm_extract_epi8(vout0123, 0);
+            output += 1;
+          }
+          c = 0;
+        }
+      } while (c != 0);
+    }
+
+    output = (uint8_t*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index b09b9e6..f76d097 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h
@@ -276,6 +276,24 @@
 DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_gemmlowp_ukernel_up8x9__neon)
 DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_gemmlowp_ukernel_up8x9__sse2)
 
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32)
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32)
+
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32)
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32)
+
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32)
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32)
+
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32)
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32)
+
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32)
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32)
+
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32)
+DECLARE_QU8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32)
+
 
 #define DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                                        \
diff --git a/test/qu8-dwconv-minmax-fp32.cc b/test/qu8-dwconv-minmax-fp32.cc
new file mode 100644
index 0000000..6971c65
--- /dev/null
+++ b/test/qu8-dwconv-minmax-fp32.cc
@@ -0,0 +1,2588 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: test/qu8-dwconv-minmax-fp32.yaml
+//   Generator: tools/generate-dwconv-test.py
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/dwconv.h>
+#include "dwconv-microkernel-tester.h"
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, c_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, c_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, c_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, c_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, multipixel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, multipixel_with_output_stride) {  // 5 pixels written with a non-contiguous output stride
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 40; channels += 7) {  // sweeps 1, 8, 15, 22, 29, 36 channels
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)  // use the swept value; a hard-coded 8 left the loop variable unused
+        .width(5)
+        .output_stride(43)  // larger than the largest swept channel count (36)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, input_zero_point_only) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .input_zero_point(255)
+        .kernel_zero_point(0)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, kernel_zero_point_only) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .input_zero_point(0)
+        .kernel_zero_point(255)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, input_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .input_offset(176)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__SSE41_MUL32, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 9; mz++) {
+      for (uint32_t channels = 16; channels < 128; channels += 24) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .input_offset(176)
+          .zero_index(mz)
+          .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, c_eq_16) {
+    TEST_REQUIRES_X86_SSE41;
+    DWConvMicrokernelTester()
+      .cr(16)
+      .kr(9)
+      .channels(16)
+      .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, c_div_16) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, c_div_16_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, c_div_16_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, c_lt_16) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 1; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, c_gt_16) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, c_gt_16_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, c_gt_16_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, multipixel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, multipixel_with_output_stride) {  // 5 pixels written with a non-contiguous output stride
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // sweeps 1, 16, 31, 46, 61, 76 channels
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)  // use the swept value; a hard-coded 16 left the loop variable unused
+        .width(5)
+        .output_stride(83)  // larger than the largest swept channel count (76)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, input_zero_point_only) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .input_zero_point(255)
+        .kernel_zero_point(0)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Kernel zero point 255 with input zero point 0; width(3), channels 1, 16, ..., 76.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, kernel_zero_point_only) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 80; c += 15) {
+      DWConvMicrokernelTester().cr(16).kr(9)
+        .channels(c).width(3)
+        .input_zero_point(0).kernel_zero_point(255)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 16 (32..224, step 48) with input_offset(304).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, input_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 32; c < 256; c += 48) {
+      DWConvMicrokernelTester().cr(16).kr(9).channels(c).input_offset(304).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Sweeps zero_index over 0..8 with input_offset(304); channels 32..224, step 48.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__SSE41_MUL32, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t z = 0; z < 9; ++z) {
+      for (uint32_t c = 32; c < 256; c += 48) {
+        DWConvMicrokernelTester().cr(16).kr(9)
+          .channels(c).input_offset(304).zero_index(z)
+          .Test(
+            xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
+            xnn_init_qu8_conv_minmax_fp32_sse2_params,
+            xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Baseline case: channels(8) equals the cr(8) setting exactly.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, c_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    DWConvMicrokernelTester().cr(8).kr(25).channels(8).Test(
+      xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+      xnn_init_qu8_conv_minmax_fp32_sse2_params,
+      xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  // Channel counts 16, 40, 64, 88, 112 -- all exact multiples of 8.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, c_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 16; c < 128; c += 24) {
+      DWConvMicrokernelTester().cr(8).kr(25).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 8 (16..112, step 24) with the tester's qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 16; c < 128; c += 24) {
+      DWConvMicrokernelTester().cr(8).kr(25).channels(c).qmin(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 8 (16..112, step 24) with the tester's qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 16; c < 128; c += 24) {
+      DWConvMicrokernelTester().cr(8).kr(25).channels(c).qmax(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count below 8 (1 through 7).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, c_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 1; c < 8; ++c) {
+      DWConvMicrokernelTester().cr(8).kr(25).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count strictly between 8 and 16 (9 through 15).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, c_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 9; c < 16; ++c) {
+      DWConvMicrokernelTester().cr(8).kr(25).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 9 through 15 with the tester's qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 9; c < 16; ++c) {
+      DWConvMicrokernelTester().cr(8).kr(25).channels(c).qmin(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 9 through 15 with the tester's qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 9; c < 16; ++c) {
+      DWConvMicrokernelTester().cr(8).kr(25).channels(c).qmax(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 8, 15, 22, 29, 36.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, multipixel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(25).channels(c).width(3).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs for every step in 2..25, over channel counts 1, 8, ..., 36.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 40; c += 7) {
+      for (size_t s = 2; s <= 25; ++s) {
+        DWConvMicrokernelTester().cr(8).kr(25)
+          .channels(c).width(3).step(s)
+          .Test(
+            xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+            xnn_init_qu8_conv_minmax_fp32_sse2_params,
+            xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  // Strided output: width(5) with output_stride(43), wider than any channel count visited
+  // (1, 8, ..., 36). The loop variable feeds channels() so each iteration differs.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8).kr(25)
+        .channels(channels)
+        .width(5).output_stride(43)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 8, 15, 22, 29, 36 with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(25)
+        .channels(c).width(3).qmin(128)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 8, 15, 22, 29, 36 with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(25)
+        .channels(c).width(3).qmax(128)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Input zero point 255 with kernel zero point 0; width(3), channels 1, 8, ..., 36.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, input_zero_point_only) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(25)
+        .channels(c).width(3)
+        .input_zero_point(255).kernel_zero_point(0)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Kernel zero point 255 with input zero point 0; width(3), channels 1, 8, ..., 36.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, kernel_zero_point_only) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(25)
+        .channels(c).width(3)
+        .input_zero_point(0).kernel_zero_point(255)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 8 (16..112, step 24) with input_offset(176).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, input_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 16; c < 128; c += 24) {
+      DWConvMicrokernelTester().cr(8).kr(25).channels(c).input_offset(176).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Sweeps zero_index over 0..24 with input_offset(176); channels 16..112, step 24.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__SSE41_MUL32, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t z = 0; z < 25; ++z) {
+      for (uint32_t c = 16; c < 128; c += 24) {
+        DWConvMicrokernelTester().cr(8).kr(25)
+          .channels(c).input_offset(176).zero_index(z)
+          .Test(
+            xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32,
+            xnn_init_qu8_conv_minmax_fp32_sse2_params,
+            xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Baseline case: channels(16) equals the cr(16) setting exactly.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, c_eq_16) {
+    TEST_REQUIRES_X86_SSE41;
+    DWConvMicrokernelTester().cr(16).kr(25).channels(16).Test(
+      xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+      xnn_init_qu8_conv_minmax_fp32_sse2_params,
+      xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  // Channel counts 32, 80, 128, 176, 224 -- all exact multiples of 16.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, c_div_16) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 32; c < 256; c += 48) {
+      DWConvMicrokernelTester().cr(16).kr(25).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 16 (32..224, step 48) with the tester's qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, c_div_16_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 32; c < 256; c += 48) {
+      DWConvMicrokernelTester().cr(16).kr(25).channels(c).qmin(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 16 (32..224, step 48) with the tester's qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, c_div_16_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 32; c < 256; c += 48) {
+      DWConvMicrokernelTester().cr(16).kr(25).channels(c).qmax(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count below 16 (1 through 15).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, c_lt_16) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 1; c < 16; ++c) {
+      DWConvMicrokernelTester().cr(16).kr(25).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count strictly between 16 and 32 (17 through 31).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, c_gt_16) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 17; c < 32; ++c) {
+      DWConvMicrokernelTester().cr(16).kr(25).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 17 through 31 with the tester's qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, c_gt_16_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 17; c < 32; ++c) {
+      DWConvMicrokernelTester().cr(16).kr(25).channels(c).qmin(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 17 through 31 with the tester's qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, c_gt_16_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 17; c < 32; ++c) {
+      DWConvMicrokernelTester().cr(16).kr(25).channels(c).qmax(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 16, 31, 46, 61, 76.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, multipixel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 80; c += 15) {
+      DWConvMicrokernelTester().cr(16).kr(25).channels(c).width(3).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs for every step in 2..25, over channel counts 1, 16, ..., 76.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 80; c += 15) {
+      for (size_t s = 2; s <= 25; ++s) {
+        DWConvMicrokernelTester().cr(16).kr(25)
+          .channels(c).width(3).step(s)
+          .Test(
+            xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+            xnn_init_qu8_conv_minmax_fp32_sse2_params,
+            xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  // Strided output: width(5) with output_stride(83), wider than any channel count visited
+  // (1, 16, ..., 76). The loop variable feeds channels() so each iteration differs.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16).kr(25)
+        .channels(channels)
+        .width(5).output_stride(83)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 16, 31, 46, 61, 76 with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 80; c += 15) {
+      DWConvMicrokernelTester().cr(16).kr(25)
+        .channels(c).width(3).qmin(128)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 16, 31, 46, 61, 76 with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 80; c += 15) {
+      DWConvMicrokernelTester().cr(16).kr(25)
+        .channels(c).width(3).qmax(128)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Input zero point 255 with kernel zero point 0; width(3), channels 1, 16, ..., 76.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, input_zero_point_only) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 80; c += 15) {
+      DWConvMicrokernelTester().cr(16).kr(25)
+        .channels(c).width(3)
+        .input_zero_point(255).kernel_zero_point(0)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Kernel zero point 255 with input zero point 0; width(3), channels 1, 16, ..., 76.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, kernel_zero_point_only) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t c = 1; c <= 80; c += 15) {
+      DWConvMicrokernelTester().cr(16).kr(25)
+        .channels(c).width(3)
+        .input_zero_point(0).kernel_zero_point(255)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 16 (32..224, step 48) with input_offset(304).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, input_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t c = 32; c < 256; c += 48) {
+      DWConvMicrokernelTester().cr(16).kr(25).channels(c).input_offset(304).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Sweeps zero_index over 0..24 with input_offset(304); channels 32..224, step 48.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__SSE41_MUL32, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t z = 0; z < 25; ++z) {
+      for (uint32_t c = 32; c < 256; c += 48) {
+        DWConvMicrokernelTester().cr(16).kr(25)
+          .channels(c).input_offset(304).zero_index(z)
+          .Test(
+            xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32,
+            xnn_init_qu8_conv_minmax_fp32_sse2_params,
+            xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Baseline case: channels(8) equals the cr(8) setting exactly.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, c_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    DWConvMicrokernelTester().cr(8).kr(9).channels(8).Test(
+      xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+      xnn_init_qu8_conv_minmax_fp32_sse2_params,
+      xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  // Channel counts 16, 40, 64, 88, 112 -- all exact multiples of 8.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, c_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 16; c < 128; c += 24) {
+      DWConvMicrokernelTester().cr(8).kr(9).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 8 (16..112, step 24) with the tester's qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 16; c < 128; c += 24) {
+      DWConvMicrokernelTester().cr(8).kr(9).channels(c).qmin(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 8 (16..112, step 24) with the tester's qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 16; c < 128; c += 24) {
+      DWConvMicrokernelTester().cr(8).kr(9).channels(c).qmax(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count below 8 (1 through 7).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, c_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 1; c < 8; ++c) {
+      DWConvMicrokernelTester().cr(8).kr(9).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count strictly between 8 and 16 (9 through 15).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, c_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 9; c < 16; ++c) {
+      DWConvMicrokernelTester().cr(8).kr(9).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 9 through 15 with the tester's qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 9; c < 16; ++c) {
+      DWConvMicrokernelTester().cr(8).kr(9).channels(c).qmin(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 9 through 15 with the tester's qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 9; c < 16; ++c) {
+      DWConvMicrokernelTester().cr(8).kr(9).channels(c).qmax(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 8, 15, 22, 29, 36.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, multipixel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(9).channels(c).width(3).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs for every step in 2..9, over channel counts 1, 8, ..., 36.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 40; c += 7) {
+      for (size_t s = 2; s <= 9; ++s) {
+        DWConvMicrokernelTester().cr(8).kr(9)
+          .channels(c).width(3).step(s)
+          .Test(
+            xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+            xnn_init_qu8_conv_minmax_fp32_sse2_params,
+            xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  // Strided output: width(5) with output_stride(43), wider than any channel count visited
+  // (1, 8, ..., 36). The loop variable feeds channels() so each iteration differs.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8).kr(9)
+        .channels(channels)
+        .width(5).output_stride(43)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 8, 15, 22, 29, 36 with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(9)
+        .channels(c).width(3).qmin(128)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 8, 15, 22, 29, 36 with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(9)
+        .channels(c).width(3).qmax(128)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Input zero point 255 with kernel zero point 0; width(3), channels 1, 8, ..., 36.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, input_zero_point_only) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(9)
+        .channels(c).width(3)
+        .input_zero_point(255).kernel_zero_point(0)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Kernel zero point 255 with input zero point 0; width(3), channels 1, 8, ..., 36.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, kernel_zero_point_only) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 40; c += 7) {
+      DWConvMicrokernelTester().cr(8).kr(9)
+        .channels(c).width(3)
+        .input_zero_point(0).kernel_zero_point(255)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 8 (16..112, step 24) with input_offset(176).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, input_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 16; c < 128; c += 24) {
+      DWConvMicrokernelTester().cr(8).kr(9).channels(c).input_offset(176).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Sweeps zero_index over 0..8 with input_offset(176); channels 16..112, step 24.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__AVX_MUL32, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t z = 0; z < 9; ++z) {
+      for (uint32_t c = 16; c < 128; c += 24) {
+        DWConvMicrokernelTester().cr(8).kr(9)
+          .channels(c).input_offset(176).zero_index(z)
+          .Test(
+            xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
+            xnn_init_qu8_conv_minmax_fp32_sse2_params,
+            xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Baseline case: channels(16) equals the cr(16) setting exactly.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, c_eq_16) {
+    TEST_REQUIRES_X86_AVX;
+    DWConvMicrokernelTester().cr(16).kr(9).channels(16).Test(
+      xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+      xnn_init_qu8_conv_minmax_fp32_sse2_params,
+      xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  // Channel counts 32, 80, 128, 176, 224 -- all exact multiples of 16.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, c_div_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 32; c < 256; c += 48) {
+      DWConvMicrokernelTester().cr(16).kr(9).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 16 (32..224, step 48) with the tester's qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, c_div_16_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 32; c < 256; c += 48) {
+      DWConvMicrokernelTester().cr(16).kr(9).channels(c).qmin(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Multiples of 16 (32..224, step 48) with the tester's qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, c_div_16_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 32; c < 256; c += 48) {
+      DWConvMicrokernelTester().cr(16).kr(9).channels(c).qmax(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count below 16 (1 through 15).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, c_lt_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 1; c < 16; ++c) {
+      DWConvMicrokernelTester().cr(16).kr(9).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count strictly between 16 and 32 (17 through 31).
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, c_gt_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 17; c < 32; ++c) {
+      DWConvMicrokernelTester().cr(16).kr(9).channels(c).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 17 through 31 with the tester's qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, c_gt_16_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 17; c < 32; ++c) {
+      DWConvMicrokernelTester().cr(16).kr(9).channels(c).qmin(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 17 through 31 with the tester's qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, c_gt_16_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t c = 17; c < 32; ++c) {
+      DWConvMicrokernelTester().cr(16).kr(9).channels(c).qmax(128).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 16, 31, 46, 61, 76.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, multipixel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 80; c += 15) {
+      DWConvMicrokernelTester().cr(16).kr(9).channels(c).width(3).Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+        xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params,
+        xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs for every step in 2..9, over channel counts 1, 16, ..., 76.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 80; c += 15) {
+      for (size_t s = 2; s <= 9; ++s) {
+        DWConvMicrokernelTester().cr(16).kr(9)
+          .channels(c).width(3).step(s)
+          .Test(
+            xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+            xnn_init_qu8_conv_minmax_fp32_sse2_params,
+            xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  // Strided output: width(5) with output_stride(83), wider than any channel count visited
+  // (1, 16, ..., 76). The loop variable feeds channels() so each iteration differs.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16).kr(9)
+        .channels(channels)
+        .width(5).output_stride(83)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 16, 31, 46, 61, 76 with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 80; c += 15) {
+      DWConvMicrokernelTester().cr(16).kr(9)
+        .channels(c).width(3).qmin(128)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // width(3) runs over channel counts 1, 16, 31, 46, 61, 76 with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t c = 1; c <= 80; c += 15) {
+      DWConvMicrokernelTester().cr(16).kr(9)
+        .channels(c).width(3).qmax(128)
+        .Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
+          xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with input_zero_point 255 and kernel_zero_point 0, channels 1 through 76 in steps of 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, input_zero_point_only) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9);
+      tester.channels(num_channels).width(3);
+      tester.input_zero_point(255).kernel_zero_point(0);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with input_zero_point 0 and kernel_zero_point 255, channels 1 through 76 in steps of 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, kernel_zero_point_only) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9);
+      tester.channels(num_channels).width(3);
+      tester.input_zero_point(0).kernel_zero_point(255);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // input_offset 304 with channel counts 32 through 224 in steps of 48.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, input_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9);
+      tester.channels(num_channels).input_offset(304);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Each zero_index from 0 through 8, combined with input_offset 304 and channels 32 through 224 in steps of 48.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__AVX_MUL32, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t zero_pos = 0; zero_pos < 9; zero_pos++) {
+      for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+        auto tester = DWConvMicrokernelTester();
+        tester.cr(16).kr(9);
+        tester.channels(num_channels).input_offset(304).zero_index(zero_pos);
+        tester.Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Single run with channels fixed at 8, the same value passed to cr.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, c_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    auto tester = DWConvMicrokernelTester();
+    tester.cr(8).kr(25).channels(8);
+    tester.Test(
+      xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  // Channel counts 16 through 112 in steps of 24, each a multiple of 8.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, c_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 16 through 112 in steps of 24, with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 16 through 112 in steps of 24, with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 1 through 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, c_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 1; num_channels < 8; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 9 through 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, c_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 9; num_channels < 16; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 9 through 15, with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 9; num_channels < 16; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 9 through 15, with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 9; num_channels < 16; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with channel counts 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, multipixel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).width(3);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3, channels 1 through 36 in steps of 7, crossed with every step value from 2 through 25.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      for (size_t pixel_step = 2; pixel_step <= 25; pixel_step++) {
+        auto tester = DWConvMicrokernelTester();
+        tester.cr(8).kr(25);
+        tester.channels(num_channels).width(3).step(pixel_step);
+        tester.Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  // Width 5 with output_stride 43, swept over channel counts 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8).kr(25)
+        .channels(channels)  // use the loop variable; the literal 8 repeated one case six times
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with qmin set to 128, over channel counts 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).width(3).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with qmax set to 128, over channel counts 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).width(3).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with input_zero_point 255 and kernel_zero_point 0, channels 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, input_zero_point_only) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).width(3);
+      tester.input_zero_point(255).kernel_zero_point(0);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with input_zero_point 0 and kernel_zero_point 255, channels 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, kernel_zero_point_only) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).width(3);
+      tester.input_zero_point(0).kernel_zero_point(255);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // input_offset 176 with channel counts 16 through 112 in steps of 24.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, input_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(25);
+      tester.channels(num_channels).input_offset(176);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Each zero_index from 0 through 24, combined with input_offset 176 and channels 16 through 112 in steps of 24.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__AVX_MUL32, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t zero_pos = 0; zero_pos < 25; zero_pos++) {
+      for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+        auto tester = DWConvMicrokernelTester();
+        tester.cr(8).kr(25);
+        tester.channels(num_channels).input_offset(176).zero_index(zero_pos);
+        tester.Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Single run with channels fixed at 16, the same value passed to cr.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, c_eq_16) {
+    TEST_REQUIRES_X86_AVX;
+    auto tester = DWConvMicrokernelTester();
+    tester.cr(16).kr(25).channels(16);
+    tester.Test(
+      xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  // Channel counts 32 through 224 in steps of 48, each a multiple of 16.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, c_div_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 32 through 224 in steps of 48, with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, c_div_16_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 32 through 224 in steps of 48, with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, c_div_16_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 1 through 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, c_lt_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 1; num_channels < 16; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 17 through 31.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, c_gt_16) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 17; num_channels < 32; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 17 through 31, with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, c_gt_16_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 17; num_channels < 32; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 17 through 31, with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, c_gt_16_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 17; num_channels < 32; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with channel counts 1 through 76 in steps of 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, multipixel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).width(3);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3, channels 1 through 76 in steps of 15, crossed with every step value from 2 through 25.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      for (size_t pixel_step = 2; pixel_step <= 25; pixel_step++) {
+        auto tester = DWConvMicrokernelTester();
+        tester.cr(16).kr(25);
+        tester.channels(num_channels).width(3).step(pixel_step);
+        tester.Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  // Width 5 with output_stride 83, swept over channel counts 1 through 76 in steps of 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16).kr(25)
+        .channels(channels)  // use the loop variable; the literal 16 repeated one case six times
+        .width(5)
+        .output_stride(83)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with qmin set to 128, over channel counts 1 through 76 in steps of 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).width(3).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with qmax set to 128, over channel counts 1 through 76 in steps of 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).width(3).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with input_zero_point 255 and kernel_zero_point 0, channels 1 through 76 in steps of 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, input_zero_point_only) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).width(3);
+      tester.input_zero_point(255).kernel_zero_point(0);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with input_zero_point 0 and kernel_zero_point 255, channels 1 through 76 in steps of 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, kernel_zero_point_only) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).width(3);
+      tester.input_zero_point(0).kernel_zero_point(255);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // input_offset 304 with channel counts 32 through 224 in steps of 48.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, input_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(25);
+      tester.channels(num_channels).input_offset(304);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Each zero_index from 0 through 24, combined with input_offset 304 and channels 32 through 224 in steps of 48.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__AVX_MUL32, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t zero_pos = 0; zero_pos < 25; zero_pos++) {
+      for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+        auto tester = DWConvMicrokernelTester();
+        tester.cr(16).kr(25);
+        tester.channels(num_channels).input_offset(304).zero_index(zero_pos);
+        tester.Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Single run with channels fixed at 8, the same value passed to cr.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, c_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    auto tester = DWConvMicrokernelTester();
+    tester.cr(8).kr(9).channels(8);
+    tester.Test(
+      xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  // Channel counts 16 through 112 in steps of 24, each a multiple of 8.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, c_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 16 through 112 in steps of 24, with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 16 through 112 in steps of 24, with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 1 through 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, c_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 1; num_channels < 8; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 9 through 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, c_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 9; num_channels < 16; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 9 through 15, with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 9; num_channels < 16; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 9 through 15, with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 9; num_channels < 16; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with channel counts 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, multipixel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).width(3);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3, channels 1 through 36 in steps of 7, crossed with every step value from 2 through 9.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      for (size_t pixel_step = 2; pixel_step <= 9; pixel_step++) {
+        auto tester = DWConvMicrokernelTester();
+        tester.cr(8).kr(9);
+        tester.channels(num_channels).width(3).step(pixel_step);
+        tester.Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  // Width 5 with output_stride 43, swept over channel counts 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8).kr(9)
+        .channels(channels)  // use the loop variable; the literal 8 repeated one case six times
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with qmin set to 128, over channel counts 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).width(3).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with qmax set to 128, over channel counts 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).width(3).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with input_zero_point 255 and kernel_zero_point 0, channels 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, input_zero_point_only) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).width(3);
+      tester.input_zero_point(255).kernel_zero_point(0);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with input_zero_point 0 and kernel_zero_point 255, channels 1 through 36 in steps of 7.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, kernel_zero_point_only) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t num_channels = 1; num_channels <= 40; num_channels += 7) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).width(3);
+      tester.input_zero_point(0).kernel_zero_point(255);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+        xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // input_offset 176 with channel counts 16 through 112 in steps of 24.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, input_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(8).kr(9);
+      tester.channels(num_channels).input_offset(176);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Each zero_index from 0 through 8, combined with input_offset 176 and channels 16 through 112 in steps of 24.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X9__XOP_MUL32, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t zero_pos = 0; zero_pos < 9; zero_pos++) {
+      for (uint32_t num_channels = 16; num_channels < 128; num_channels += 24) {
+        auto tester = DWConvMicrokernelTester();
+        tester.cr(8).kr(9);
+        tester.channels(num_channels).input_offset(176).zero_index(zero_pos);
+        tester.Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Single run with channels fixed at 16, the same value passed to cr.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, c_eq_16) {
+    TEST_REQUIRES_X86_XOP;
+    auto tester = DWConvMicrokernelTester();
+    tester.cr(16).kr(9).channels(16);
+    tester.Test(
+      xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  // Channel counts 32 through 224 in steps of 48, each a multiple of 16.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, c_div_16) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 32 through 224 in steps of 48, with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, c_div_16_with_qmin) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9);
+      tester.channels(num_channels).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Channel counts 32 through 224 in steps of 48, with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, c_div_16_with_qmax) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 32; num_channels < 256; num_channels += 48) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9);
+      tester.channels(num_channels).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 1 through 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, c_lt_16) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 1; num_channels < 16; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 17 through 31.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, c_gt_16) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 17; num_channels < 32; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9).channels(num_channels);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 17 through 31, with qmin set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, c_gt_16_with_qmin) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 17; num_channels < 32; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9);
+      tester.channels(num_channels).qmin(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Every channel count from 17 through 31, with qmax set to 128.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, c_gt_16_with_qmax) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t num_channels = 17; num_channels < 32; num_channels++) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9);
+      tester.channels(num_channels).qmax(128);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3 with channel counts 1 through 76 in steps of 15.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, multipixel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      auto tester = DWConvMicrokernelTester();
+      tester.cr(16).kr(9);
+      tester.channels(num_channels).width(3);
+      tester.Test(
+        xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  // Width 3, channels 1 through 76 in steps of 15, crossed with every step value from 2 through 9.
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, multipixel_with_step) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t num_channels = 1; num_channels <= 80; num_channels += 15) {
+      for (size_t pixel_step = 2; pixel_step <= 9; pixel_step++) {
+        auto tester = DWConvMicrokernelTester();
+        tester.cr(16).kr(9);
+        tester.channels(num_channels).width(3).step(pixel_step);
+        tester.Test(
+          xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params,
+          xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, multipixel_with_output_stride) {  // width(5) with output_stride(83)
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76 -- all below the stride of 83
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)  // pass the loop variable; a fixed 16 here repeated one identical case six times
+        .width(5)
+        .output_stride(83)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, multipixel_with_qmin) {  // width(3) with qmin(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, multipixel_with_qmax) {  // width(3) with qmax(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, input_zero_point_only) {  // input zero point at 255, kernel zero point at 0
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .input_zero_point(255)
+        .kernel_zero_point(0)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, kernel_zero_point_only) {  // kernel zero point at 255, input zero point at 0
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .input_zero_point(0)
+        .kernel_zero_point(255)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, input_offset) {  // input_offset(304) on channel counts that are multiples of 16
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {  // 32, 80, 128, 176, 224
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .input_offset(304)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X9__XOP_MUL32, zero) {  // input_offset(304) combined with every zero_index below kr(9)
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 9; mz++) {  // mz is passed to zero_index(): 0..8
+      for (uint32_t channels = 32; channels < 256; channels += 48) {  // 32, 80, 128, 176, 224
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(9)
+          .channels(channels)
+          .input_offset(304)
+          .zero_index(mz)
+          .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, c_eq_8) {  // single run with channels equal to cr(8)
+    TEST_REQUIRES_X86_XOP;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(25)
+      .channels(8)
+      .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, c_div_8) {  // channel counts that are multiples of cr(8)
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {  // 16, 40, 64, 88, 112
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, c_div_8_with_qmin) {  // multiples of cr(8), with qmin(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {  // 16, 40, 64, 88, 112
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, c_div_8_with_qmax) {  // multiples of cr(8), with qmax(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {  // 16, 40, 64, 88, 112
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, c_lt_8) {  // channel counts below cr(8)
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 1; channels < 8; channels++) {  // every count in 1..7
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, c_gt_8) {  // channel counts above cr(8)
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 9; channels < 16; channels++) {  // every count in 9..15
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, c_gt_8_with_qmin) {  // channel counts above cr(8), with qmin(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 9; channels < 16; channels++) {  // every count in 9..15
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, c_gt_8_with_qmax) {  // channel counts above cr(8), with qmax(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 9; channels < 16; channels++) {  // every count in 9..15
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, multipixel) {  // width(3) instead of the tester's default width
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 40; channels += 7) {  // 1, 8, 15, 22, 29, 36
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, multipixel_with_step) {  // width(3) combined with every step in 2..25
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 40; channels += 7) {  // 1, 8, 15, 22, 29, 36
+      for (size_t step = 2; step <= 25; step++) {  // upper bound equals kr(25)
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, multipixel_with_output_stride) {  // width(5) with output_stride(43)
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 40; channels += 7) {  // 1, 8, 15, 22, 29, 36 -- all below the stride of 43
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)  // pass the loop variable; a fixed 8 here repeated one identical case six times
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, multipixel_with_qmin) {  // width(3) with qmin(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 40; channels += 7) {  // 1, 8, 15, 22, 29, 36
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, multipixel_with_qmax) {  // width(3) with qmax(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 40; channels += 7) {  // 1, 8, 15, 22, 29, 36
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, input_zero_point_only) {  // input zero point at 255, kernel zero point at 0
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 40; channels += 7) {  // 1, 8, 15, 22, 29, 36
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .input_zero_point(255)
+        .kernel_zero_point(0)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, kernel_zero_point_only) {  // kernel zero point at 255, input zero point at 0
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 40; channels += 7) {  // 1, 8, 15, 22, 29, 36
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .input_zero_point(0)
+        .kernel_zero_point(255)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, input_offset) {  // input_offset(176) on channel counts that are multiples of 8
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {  // 16, 40, 64, 88, 112
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .input_offset(176)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP8X25__XOP_MUL32, zero) {  // input_offset(176) combined with every zero_index below kr(25)
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 25; mz++) {  // mz is passed to zero_index(): 0..24
+      for (uint32_t channels = 16; channels < 128; channels += 24) {  // 16, 40, 64, 88, 112
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .input_offset(176)
+          .zero_index(mz)
+          .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, c_eq_16) {  // single run with channels equal to cr(16)
+    TEST_REQUIRES_X86_XOP;
+    DWConvMicrokernelTester()
+      .cr(16)
+      .kr(25)
+      .channels(16)
+      .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, c_div_16) {  // channel counts that are multiples of cr(16)
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {  // 32, 80, 128, 176, 224
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, c_div_16_with_qmin) {  // multiples of cr(16), with qmin(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {  // 32, 80, 128, 176, 224
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, c_div_16_with_qmax) {  // multiples of cr(16), with qmax(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {  // 32, 80, 128, 176, 224
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, c_lt_16) {  // channel counts below cr(16)
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 1; channels < 16; channels++) {  // every count in 1..15
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, c_gt_16) {  // channel counts above cr(16)
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 17; channels < 32; channels++) {  // every count in 17..31
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, c_gt_16_with_qmin) {  // channel counts above cr(16), with qmin(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 17; channels < 32; channels++) {  // every count in 17..31
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, c_gt_16_with_qmax) {  // channel counts above cr(16), with qmax(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 17; channels < 32; channels++) {  // every count in 17..31
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, multipixel) {  // width(3) instead of the tester's default width
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, multipixel_with_step) {  // width(3) combined with every step in 2..25
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      for (size_t step = 2; step <= 25; step++) {  // upper bound equals kr(25)
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, multipixel_with_output_stride) {  // width(5) with output_stride(83)
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76 -- all below the stride of 83
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)  // pass the loop variable; a fixed 16 here repeated one identical case six times
+        .width(5)
+        .output_stride(83)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, multipixel_with_qmin) {  // width(3) with qmin(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, multipixel_with_qmax) {  // width(3) with qmax(128) set
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, input_zero_point_only) {  // input zero point at 255, kernel zero point at 0
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .input_zero_point(255)
+        .kernel_zero_point(0)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, kernel_zero_point_only) {  // kernel zero point at 255, input zero point at 0
+    TEST_REQUIRES_X86_XOP;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // 1, 16, 31, 46, 61, 76
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .input_zero_point(0)
+        .kernel_zero_point(255)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, input_offset) {  // input_offset(304) on channel counts that are multiples of 16
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {  // 32, 80, 128, 176, 224
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .input_offset(304)
+        .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+    }
+  }
+
+  TEST(QU8_DWCONV_MINMAX_FP32_UP16X25__XOP_MUL32, zero) {  // input_offset(304) combined with every zero_index below kr(25)
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 25; mz++) {  // mz is passed to zero_index(): 0..24
+      for (uint32_t channels = 32; channels < 256; channels += 48) {  // 32, 80, 128, 176, 224
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(25)
+          .channels(channels)
+          .input_offset(304)
+          .zero_index(mz)
+          .Test(xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/test/qu8-dwconv-minmax-fp32.yaml b/test/qu8-dwconv-minmax-fp32.yaml
new file mode 100644
index 0000000..8fc3594
--- /dev/null
+++ b/test/qu8-dwconv-minmax-fp32.yaml
@@ -0,0 +1,32 @@
+# Copyright 2021 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# QU8 DWCONV minmax (fp32) microkernels for SSE4.1, AVX and XOP.
+# Each entry pairs a microkernel symbol (`name`) with the function that
+# initializes its parameters (`init`).
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32  # SSE4.1 variants (4 entries)
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params  # every entry in this list uses this same initializer
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul32
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul32
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32  # AVX variants (4 entries)
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul32
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul32
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul32  # XOP variants (4 entries)
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul32
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params
+- name: xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32
+  init: xnn_init_qu8_conv_minmax_fp32_sse2_params