Make SSE2 microkernels consistent with neon zip microkernels.
 - DEC is now MOV

PiperOrigin-RevId: 425319775
diff --git a/BUILD.bazel b/BUILD.bazel
index bdcc0a2..4d2271e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5179,22 +5179,22 @@
     "src/u8-maxpool/9p8x-minmax-sse2-c16.c",
     "src/u8-rmax/sse2.c",
     "src/u8-vclamp/sse2-x64.c",
-    "src/x8-transpose/gen/16x16-reuse-dec-sse2.c",
+    "src/x8-transpose/gen/16x16-reuse-mov-sse2.c",
     "src/x8-transpose/gen/16x16-reuse-switch-sse2.c",
     "src/x8-zip/x2-sse2.c",
     "src/x8-zip/x3-sse2.c",
     "src/x8-zip/x4-sse2.c",
     "src/x8-zip/xm-sse2.c",
     "src/x16-transpose/4x8-sse2.c",
-    "src/x16-transpose/gen/8x8-multi-dec-sse2.c",
+    "src/x16-transpose/gen/8x8-multi-mov-sse2.c",
     "src/x16-transpose/gen/8x8-multi-switch-sse2.c",
-    "src/x16-transpose/gen/8x8-reuse-dec-sse2.c",
+    "src/x16-transpose/gen/8x8-reuse-mov-sse2.c",
     "src/x16-transpose/gen/8x8-reuse-multi-sse2.c",
     "src/x16-transpose/gen/8x8-reuse-switch-sse2.c",
-    "src/x32-transpose/gen/4x4-multi-dec-sse2.c",
+    "src/x32-transpose/gen/4x4-multi-mov-sse2.c",
     "src/x32-transpose/gen/4x4-multi-multi-sse2.c",
     "src/x32-transpose/gen/4x4-multi-switch-sse2.c",
-    "src/x32-transpose/gen/4x4-reuse-dec-sse2.c",
+    "src/x32-transpose/gen/4x4-reuse-mov-sse2.c",
     "src/x32-transpose/gen/4x4-reuse-multi-sse2.c",
     "src/x32-transpose/gen/4x4-reuse-switch-sse2.c",
     "src/x32-unpool/sse2.c",
@@ -5202,10 +5202,10 @@
     "src/x32-zip/x3-sse2.c",
     "src/x32-zip/x4-sse2.c",
     "src/x32-zip/xm-sse2.c",
-    "src/x64-transpose/gen/2x2-multi-dec-sse2.c",
+    "src/x64-transpose/gen/2x2-multi-mov-sse2.c",
     "src/x64-transpose/gen/2x2-multi-multi-sse2.c",
     "src/x64-transpose/gen/2x2-multi-switch-sse2.c",
-    "src/x64-transpose/gen/2x2-reuse-dec-sse2.c",
+    "src/x64-transpose/gen/2x2-reuse-mov-sse2.c",
     "src/x64-transpose/gen/2x2-reuse-multi-sse2.c",
     "src/x64-transpose/gen/2x2-reuse-switch-sse2.c",
     "src/xx-fill/sse2-x64.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9f44a57..5c37ae0 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3941,22 +3941,22 @@
   src/u8-maxpool/9p8x-minmax-sse2-c16.c
   src/u8-rmax/sse2.c
   src/u8-vclamp/sse2-x64.c
-  src/x8-transpose/gen/16x16-reuse-dec-sse2.c
+  src/x8-transpose/gen/16x16-reuse-mov-sse2.c
   src/x8-transpose/gen/16x16-reuse-switch-sse2.c
   src/x8-zip/x2-sse2.c
   src/x8-zip/x3-sse2.c
   src/x8-zip/x4-sse2.c
   src/x8-zip/xm-sse2.c
   src/x16-transpose/4x8-sse2.c
-  src/x16-transpose/gen/8x8-multi-dec-sse2.c
+  src/x16-transpose/gen/8x8-multi-mov-sse2.c
   src/x16-transpose/gen/8x8-multi-switch-sse2.c
-  src/x16-transpose/gen/8x8-reuse-dec-sse2.c
+  src/x16-transpose/gen/8x8-reuse-mov-sse2.c
   src/x16-transpose/gen/8x8-reuse-multi-sse2.c
   src/x16-transpose/gen/8x8-reuse-switch-sse2.c
-  src/x32-transpose/gen/4x4-multi-dec-sse2.c
+  src/x32-transpose/gen/4x4-multi-mov-sse2.c
   src/x32-transpose/gen/4x4-multi-multi-sse2.c
   src/x32-transpose/gen/4x4-multi-switch-sse2.c
-  src/x32-transpose/gen/4x4-reuse-dec-sse2.c
+  src/x32-transpose/gen/4x4-reuse-mov-sse2.c
   src/x32-transpose/gen/4x4-reuse-multi-sse2.c
   src/x32-transpose/gen/4x4-reuse-switch-sse2.c
   src/x32-unpool/sse2.c
@@ -3964,10 +3964,10 @@
   src/x32-zip/x3-sse2.c
   src/x32-zip/x4-sse2.c
   src/x32-zip/xm-sse2.c
-  src/x64-transpose/gen/2x2-multi-dec-sse2.c
+  src/x64-transpose/gen/2x2-multi-mov-sse2.c
   src/x64-transpose/gen/2x2-multi-multi-sse2.c
   src/x64-transpose/gen/2x2-multi-switch-sse2.c
-  src/x64-transpose/gen/2x2-reuse-dec-sse2.c
+  src/x64-transpose/gen/2x2-reuse-mov-sse2.c
   src/x64-transpose/gen/2x2-reuse-multi-sse2.c
   src/x64-transpose/gen/2x2-reuse-switch-sse2.c
   src/xx-fill/sse2-x64.c
diff --git a/bench/x16-transpose.cc b/bench/x16-transpose.cc
index 84de4b2..589597b 100644
--- a/bench/x16-transpose.cc
+++ b/bench/x16-transpose.cc
@@ -77,11 +77,11 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   BENCHMARK_CAPTURE(transpose, 8x8_sse, xnn_x16_transpose_ukernel__4x8_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
-  BENCHMARK_CAPTURE(transpose, 8x8_multi_dec_sse2, xnn_x16_transpose_ukernel__8x8_multi_dec_sse2)
+  BENCHMARK_CAPTURE(transpose, 8x8_multi_mov_sse2, xnn_x16_transpose_ukernel__8x8_multi_mov_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
   BENCHMARK_CAPTURE(transpose, 8x8_multi_switch_sse2, xnn_x16_transpose_ukernel__8x8_multi_switch_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
-  BENCHMARK_CAPTURE(transpose, 8x8_reuse_dec_sse2, xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2)
+  BENCHMARK_CAPTURE(transpose, 8x8_reuse_mov_sse2, xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
   BENCHMARK_CAPTURE(transpose, 8x8_reuse_multi_sse2, xnn_x16_transpose_ukernel__8x8_reuse_multi_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
diff --git a/bench/x32-transpose.cc b/bench/x32-transpose.cc
index 8e54ea5..a54bd3e 100644
--- a/bench/x32-transpose.cc
+++ b/bench/x32-transpose.cc
@@ -103,13 +103,13 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   BENCHMARK_CAPTURE(transpose, 4x4_sse, xnn_x32_transpose_ukernel__4x4_sse)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
-  BENCHMARK_CAPTURE(transpose, 4x4_multi_dec_sse2, xnn_x32_transpose_ukernel__4x4_multi_dec_sse2)
+  BENCHMARK_CAPTURE(transpose, 4x4_multi_mov_sse2, xnn_x32_transpose_ukernel__4x4_multi_mov_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
   BENCHMARK_CAPTURE(transpose, 4x4_multi_multi_sse2, xnn_x32_transpose_ukernel__4x4_multi_multi_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
   BENCHMARK_CAPTURE(transpose, 4x4_multi_switch_sse2, xnn_x32_transpose_ukernel__4x4_multi_switch_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
-  BENCHMARK_CAPTURE(transpose, 4x4_reuse_dec_sse2, xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2)
+  BENCHMARK_CAPTURE(transpose, 4x4_reuse_mov_sse2, xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
   BENCHMARK_CAPTURE(transpose, 4x4_reuse_multi_sse2, xnn_x32_transpose_ukernel__4x4_reuse_multi_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
diff --git a/bench/x64-transpose.cc b/bench/x64-transpose.cc
index e903798..79a1e5d 100644
--- a/bench/x64-transpose.cc
+++ b/bench/x64-transpose.cc
@@ -79,11 +79,11 @@
     ->Apply(BenchmarkKernelSize)->UseRealTime();
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  BENCHMARK_CAPTURE(transpose, 2x2_multi_dec_sse2, xnn_x64_transpose_ukernel__2x2_multi_dec_sse2)
+  BENCHMARK_CAPTURE(transpose, 2x2_multi_mov_sse2, xnn_x64_transpose_ukernel__2x2_multi_mov_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
   BENCHMARK_CAPTURE(transpose, 2x2_multi_switch_sse2, xnn_x64_transpose_ukernel__2x2_multi_switch_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
-  BENCHMARK_CAPTURE(transpose, 2x2_reuse_dec_sse2, xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2)
+  BENCHMARK_CAPTURE(transpose, 2x2_reuse_mov_sse2, xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
   BENCHMARK_CAPTURE(transpose, 2x2_reuse_multi_sse2, xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
diff --git a/bench/x8-transpose.cc b/bench/x8-transpose.cc
index 9f20f04..23b8651 100644
--- a/bench/x8-transpose.cc
+++ b/bench/x8-transpose.cc
@@ -75,7 +75,7 @@
     ->Apply(BenchmarkKernelSize)->UseRealTime();
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  BENCHMARK_CAPTURE(transpose, 16x16_reuse_dec_sse2, xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2)
+  BENCHMARK_CAPTURE(transpose, 16x16_reuse_mov_sse2, xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
   BENCHMARK_CAPTURE(transpose, 16x16_reuse_switch_sse2, xnn_x8_transpose_ukernel__16x16_reuse_switch_sse2)
       ->Apply(BenchmarkKernelSize)->UseRealTime();
diff --git a/scripts/generate-xN-transpose.sh b/scripts/generate-xN-transpose.sh
index adeecee..ba0e9e0 100755
--- a/scripts/generate-xN-transpose.sh
+++ b/scripts/generate-xN-transpose.sh
@@ -49,25 +49,25 @@
 tools/xngen src/x32-transpose/scalar.c.in -D TILE_HEIGHT=4 TILE_WIDTH=2 TYPE=double SIZE=64 -o src/x64-transpose/gen/4x2-scalar-float.c &
 
 #################################### SSE2 ###################################
-tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=DEC SIZE=8 -o src/x8-transpose/gen/16x16-reuse-dec-sse2.c &
+tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MOV SIZE=8 -o src/x8-transpose/gen/16x16-reuse-mov-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=SWITCH SIZE=8 -o src/x8-transpose/gen/16x16-reuse-switch-sse2.c &
-tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=DEC SIZE=16 -o src/x16-transpose/gen/8x8-reuse-dec-sse2.c &
+tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MOV SIZE=16 -o src/x16-transpose/gen/8x8-reuse-mov-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=SWITCH SIZE=16 -o src/x16-transpose/gen/8x8-reuse-switch-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MULTI SIZE=16 -o src/x16-transpose/gen/8x8-reuse-multi-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=SWITCH SIZE=16 -o src/x16-transpose/gen/8x8-multi-switch-sse2.c &
-tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=DEC SIZE=16 -o src/x16-transpose/gen/8x8-multi-dec-sse2.c &
-tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=DEC SIZE=32 -o src/x32-transpose/gen/4x4-reuse-dec-sse2.c &
+tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=MOV SIZE=16 -o src/x16-transpose/gen/8x8-multi-mov-sse2.c &
+tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MOV SIZE=32 -o src/x32-transpose/gen/4x4-reuse-mov-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=SWITCH SIZE=32 -o src/x32-transpose/gen/4x4-reuse-switch-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MULTI SIZE=32 -o src/x32-transpose/gen/4x4-reuse-multi-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=SWITCH SIZE=32 -o src/x32-transpose/gen/4x4-multi-switch-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=MULTI SIZE=32 -o src/x32-transpose/gen/4x4-multi-multi-sse2.c &
-tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=DEC SIZE=32 -o src/x32-transpose/gen/4x4-multi-dec-sse2.c &
-tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=DEC SIZE=64 -o src/x64-transpose/gen/2x2-reuse-dec-sse2.c &
+tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=MOV SIZE=32 -o src/x32-transpose/gen/4x4-multi-mov-sse2.c &
+tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MOV SIZE=64 -o src/x64-transpose/gen/2x2-reuse-mov-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=SWITCH SIZE=64 -o src/x64-transpose/gen/2x2-reuse-switch-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MULTI SIZE=64 -o src/x64-transpose/gen/2x2-reuse-multi-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=SWITCH SIZE=64 -o src/x64-transpose/gen/2x2-multi-switch-sse2.c &
 tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=MULTI SIZE=64 -o src/x64-transpose/gen/2x2-multi-multi-sse2.c &
-tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=DEC SIZE=64 -o src/x64-transpose/gen/2x2-multi-dec-sse2.c &
+tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=MOV SIZE=64 -o src/x64-transpose/gen/2x2-multi-mov-sse2.c &
 
 ################################## Unit tests #################################
 tools/generate-transpose-test.py --spec test/x8-transpose.yaml --output=test/x8-transpose.cc &
diff --git a/src/x16-transpose/gen/8x8-multi-dec-sse2.c b/src/x16-transpose/gen/8x8-multi-mov-sse2.c
similarity index 99%
rename from src/x16-transpose/gen/8x8-multi-dec-sse2.c
rename to src/x16-transpose/gen/8x8-multi-mov-sse2.c
index e23c1cc..a074c49 100644
--- a/src/x16-transpose/gen/8x8-multi-dec-sse2.c
+++ b/src/x16-transpose/gen/8x8-multi-mov-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 #include <xnnpack/transpose.h>
 
-void xnn_x16_transpose_ukernel__8x8_multi_dec_sse2(
+void xnn_x16_transpose_ukernel__8x8_multi_mov_sse2(
     const uint16_t* input,
     uint16_t* output,
     size_t input_stride,
diff --git a/src/x16-transpose/gen/8x8-reuse-dec-sse2.c b/src/x16-transpose/gen/8x8-reuse-mov-sse2.c
similarity index 99%
rename from src/x16-transpose/gen/8x8-reuse-dec-sse2.c
rename to src/x16-transpose/gen/8x8-reuse-mov-sse2.c
index f27e352..7e284de 100644
--- a/src/x16-transpose/gen/8x8-reuse-dec-sse2.c
+++ b/src/x16-transpose/gen/8x8-reuse-mov-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 #include <xnnpack/transpose.h>
 
-void xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2(
+void xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2(
     const uint16_t* input,
     uint16_t* output,
     size_t input_stride,
diff --git a/src/x32-transpose/gen/4x4-multi-dec-sse2.c b/src/x32-transpose/gen/4x4-multi-mov-sse2.c
similarity index 98%
rename from src/x32-transpose/gen/4x4-multi-dec-sse2.c
rename to src/x32-transpose/gen/4x4-multi-mov-sse2.c
index f8ae72d..4d4a969 100644
--- a/src/x32-transpose/gen/4x4-multi-dec-sse2.c
+++ b/src/x32-transpose/gen/4x4-multi-mov-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 #include <xnnpack/transpose.h>
 
-void xnn_x32_transpose_ukernel__4x4_multi_dec_sse2(
+void xnn_x32_transpose_ukernel__4x4_multi_mov_sse2(
     const uint32_t* input,
     uint32_t* output,
     size_t input_stride,
diff --git a/src/x32-transpose/gen/4x4-reuse-dec-sse2.c b/src/x32-transpose/gen/4x4-reuse-mov-sse2.c
similarity index 98%
rename from src/x32-transpose/gen/4x4-reuse-dec-sse2.c
rename to src/x32-transpose/gen/4x4-reuse-mov-sse2.c
index 1e9689e..61fa3ad 100644
--- a/src/x32-transpose/gen/4x4-reuse-dec-sse2.c
+++ b/src/x32-transpose/gen/4x4-reuse-mov-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 #include <xnnpack/transpose.h>
 
-void xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2(
+void xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2(
     const uint32_t* input,
     uint32_t* output,
     size_t input_stride,
diff --git a/src/x32-transpose/sse2.c.in b/src/x32-transpose/sse2.c.in
index b0a8fd8..ff52164 100644
--- a/src/x32-transpose/sse2.c.in
+++ b/src/x32-transpose/sse2.c.in
@@ -4,7 +4,7 @@
 // LICENSE file in the root directory of this source tree.
 $import math
 $assert IN_PTRS in ["MULTI", "REUSE"]
-$assert OUT_PTRS in ["MULTI", "SWITCH", "DEC"]
+$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV"]
 $assert SIZE in [8, 16, 32, 64]
 $TILE_SIZE = int(128/SIZE)
 $NUM_ITERS = int(math.log2(TILE_SIZE))
@@ -61,7 +61,7 @@
         if XNN_UNPREDICTABLE(block_width < ${N+2}) {
           o${N+1} = o0;
         }
-    $if OUT_PTRS in ["SWITCH", "DEC"]:
+    $if OUT_PTRS in ["SWITCH", "MOV"]:
       const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
       const size_t oN_stride = rem * output_stride;
     size_t bh = block_height;
@@ -112,7 +112,7 @@
           default:
             XNN_UNREACHABLE;
         }
-      $elif OUT_PTRS == "DEC":
+      $elif OUT_PTRS == "MOV":
         uint${SIZE}_t* o${TILE_SIZE-1} = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
         _mm_storeu_si128((__m128i*) o${TILE_SIZE-1}, v0_${TILE_SIZE-1});
         $for N in reversed(range(2, TILE_SIZE, 2)):
@@ -208,7 +208,7 @@
           }
           $if NUM_ITERS > 1:
             o += ${TILE_SIZE>>1};
-        $elif OUT_PTRS == "DEC":
+        $elif OUT_PTRS == "MOV":
           uint${SIZE}_t* o${TILE_SIZE-1} = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
           _mm_storel_epi64((__m128i*) o${TILE_SIZE-1}, v0_${TILE_SIZE-1});
           $for N in reversed(range(2, TILE_SIZE, 2)):
@@ -254,7 +254,7 @@
             }
             $if NUM_ITERS > 2:
               o += ${TILE_SIZE>>2};
-          $elif OUT_PTRS == "DEC":
+          $elif OUT_PTRS == "MOV":
             uint${SIZE}_t* o${TILE_SIZE-1} = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
             *((int*) o${TILE_SIZE-1}) = _mm_cvtsi128_si32(v0_${TILE_SIZE-1});
             $for N in reversed(range(2, TILE_SIZE, 2)):
@@ -299,7 +299,7 @@
             }
             $if NUM_ITERS>3:
               o += ${TILE_SIZE>>3};
-          $elif OUT_PTRS == "DEC":
+          $elif OUT_PTRS == "MOV":
             uint${SIZE}_t* o${TILE_SIZE-1} = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
             *((uint16_t*) o${TILE_SIZE-1}) = (uint16_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1});
             $for N in reversed(range(2, TILE_SIZE, 2)):
@@ -342,7 +342,7 @@
               default:
                 XNN_UNREACHABLE;
             }
-          $elif OUT_PTRS == "DEC":
+          $elif OUT_PTRS == "MOV":
             uint${SIZE}_t* o${TILE_SIZE-1} = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
             *((uint8_t*) o${TILE_SIZE-1}) = (uint8_t) _mm_cvtsi128_si32(v0_${TILE_SIZE-1});
             $for N in reversed(range(2, TILE_SIZE, 2)):
diff --git a/src/x64-transpose/gen/2x2-multi-dec-sse2.c b/src/x64-transpose/gen/2x2-multi-mov-sse2.c
similarity index 97%
rename from src/x64-transpose/gen/2x2-multi-dec-sse2.c
rename to src/x64-transpose/gen/2x2-multi-mov-sse2.c
index a92dac6..930bbde 100644
--- a/src/x64-transpose/gen/2x2-multi-dec-sse2.c
+++ b/src/x64-transpose/gen/2x2-multi-mov-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 #include <xnnpack/transpose.h>
 
-void xnn_x64_transpose_ukernel__2x2_multi_dec_sse2(
+void xnn_x64_transpose_ukernel__2x2_multi_mov_sse2(
     const uint64_t* input,
     uint64_t* output,
     size_t input_stride,
diff --git a/src/x64-transpose/gen/2x2-reuse-dec-sse2.c b/src/x64-transpose/gen/2x2-reuse-mov-sse2.c
similarity index 97%
rename from src/x64-transpose/gen/2x2-reuse-dec-sse2.c
rename to src/x64-transpose/gen/2x2-reuse-mov-sse2.c
index d995c8d..5832cea 100644
--- a/src/x64-transpose/gen/2x2-reuse-dec-sse2.c
+++ b/src/x64-transpose/gen/2x2-reuse-mov-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 #include <xnnpack/transpose.h>
 
-void xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2(
+void xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2(
     const uint64_t* input,
     uint64_t* output,
     size_t input_stride,
diff --git a/src/x8-transpose/gen/16x16-reuse-dec-sse2.c b/src/x8-transpose/gen/16x16-reuse-mov-sse2.c
similarity index 99%
rename from src/x8-transpose/gen/16x16-reuse-dec-sse2.c
rename to src/x8-transpose/gen/16x16-reuse-mov-sse2.c
index 03d48de..1887e06 100644
--- a/src/x8-transpose/gen/16x16-reuse-dec-sse2.c
+++ b/src/x8-transpose/gen/16x16-reuse-mov-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 #include <xnnpack/transpose.h>
 
-void xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2(
+void xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2(
     const uint8_t* input,
     uint8_t* output,
     size_t input_stride,
diff --git a/src/xnnpack/transpose.h b/src/xnnpack/transpose.h
index bbc8b02..dcaba57 100644
--- a/src/xnnpack/transpose.h
+++ b/src/xnnpack/transpose.h
@@ -32,10 +32,10 @@
 DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__2x2_scalar_float)
 DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__4x1_scalar_float)
 DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__4x2_scalar_float)
-DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2)
+DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2)
 DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2)
 DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2)
-DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2)
+DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2)
 DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2)
 DECLARE_X64_TRANSPOSE_UKERNEL_FUNCTION(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2)
 
@@ -66,10 +66,10 @@
 DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl)
 DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_sse)
 DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_wasmsimd)
-DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2)
+DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2)
 DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_multi_multi_sse2)
 DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_multi_switch_sse2)
-DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2)
+DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2)
 DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_reuse_multi_sse2)
 DECLARE_X32_TRANSPOSE_UKERNEL_FUNCTION(xnn_x32_transpose_ukernel__4x4_reuse_switch_sse2)
 
@@ -90,9 +90,9 @@
 DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__4x2_scalar_int)
 DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__4x4_scalar_int)
 DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__4x8_sse2)
-DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2)
+DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2)
 DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__8x8_multi_switch_sse2)
-DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2)
+DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2)
 DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__8x8_reuse_multi_sse2)
 DECLARE_X16_TRANSPOSE_UKERNEL_FUNCTION(xnn_x16_transpose_ukernel__8x8_reuse_switch_sse2)
 
@@ -104,7 +104,7 @@
                             size_t block_width,         \
                             size_t block_height);
 
-DECLARE_X8_TRANSPOSE_UKERNEL_FUNCTION(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2)
+DECLARE_X8_TRANSPOSE_UKERNEL_FUNCTION(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2)
 DECLARE_X8_TRANSPOSE_UKERNEL_FUNCTION(xnn_x8_transpose_ukernel__16x16_reuse_switch_sse2)
 
 #define DECLARE_X8_TRANSPOSE_UKERNEL_FUNCTION(fn_name)  \
diff --git a/test/x16-transpose.cc b/test/x16-transpose.cc
index 9d17865..40ba987 100644
--- a/test/x16-transpose.cc
+++ b/test/x16-transpose.cc
@@ -1257,7 +1257,7 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_8_bw_8) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_8_bw_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -1265,10 +1265,10 @@
       .block_width(8)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_1_16_bw_1_16) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_1_16_bw_1_16) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 1; i <= 16; ++i){
       for(size_t j = 1; j <= 16; ++j){
@@ -1278,12 +1278,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+          .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
       }
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_8_bw_16) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_8_bw_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(16)
@@ -1291,10 +1291,10 @@
       .block_width(16)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_8_bw_9_16) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_8_bw_9_16) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       TransposeMicrokernelTester()
@@ -1303,11 +1303,11 @@
         .block_width(i)
         .block_height(8)
         .iterations(1)
-        .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+        .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_16_bw_9_16) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_16_bw_9_16) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       TransposeMicrokernelTester()
@@ -1316,11 +1316,11 @@
         .block_width(i)
         .block_height(16)
         .iterations(1)
-        .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+        .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_16_bw_8) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_16_bw_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -1328,10 +1328,10 @@
       .block_width(8)
       .block_height(16)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_9_16_bw_8){
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_9_16_bw_8){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       TransposeMicrokernelTester()
@@ -1340,11 +1340,11 @@
         .block_width(8)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+        .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_9_16_bw_16){
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_9_16_bw_16){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       TransposeMicrokernelTester()
@@ -1353,11 +1353,11 @@
         .block_width(16)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+        .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_9_16_bw_9_16) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_9_16_bw_9_16) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       for(size_t j = 9; j < 16; ++j){
@@ -1367,12 +1367,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+          .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
       }
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_8_bw_8_is_16) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_8_bw_8_is_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(16)
@@ -1380,10 +1380,10 @@
       .block_width(8)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_8_bw_8_os_16) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_8_bw_8_os_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -1391,10 +1391,10 @@
       .block_width(8)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_MULTI_DEC_SSE2, bh_8_bw_8_is_16_os_16) {
+  TEST(X16_TRANSPOSE__8X8_MULTI_MOV_SSE2, bh_8_bw_8_is_16_os_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(16)
@@ -1402,7 +1402,7 @@
       .block_width(8)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_multi_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_multi_mov_sse2);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -1559,7 +1559,7 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_8_bw_8) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_8_bw_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -1567,10 +1567,10 @@
       .block_width(8)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_1_16_bw_1_16) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_1_16_bw_1_16) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 1; i <= 16; ++i){
       for(size_t j = 1; j <= 16; ++j){
@@ -1580,12 +1580,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+          .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
       }
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_8_bw_16) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_8_bw_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(16)
@@ -1593,10 +1593,10 @@
       .block_width(16)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_8_bw_9_16) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_8_bw_9_16) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       TransposeMicrokernelTester()
@@ -1605,11 +1605,11 @@
         .block_width(i)
         .block_height(8)
         .iterations(1)
-        .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+        .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_16_bw_9_16) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_16_bw_9_16) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       TransposeMicrokernelTester()
@@ -1618,11 +1618,11 @@
         .block_width(i)
         .block_height(16)
         .iterations(1)
-        .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+        .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_16_bw_8) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_16_bw_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -1630,10 +1630,10 @@
       .block_width(8)
       .block_height(16)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_9_16_bw_8){
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_9_16_bw_8){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       TransposeMicrokernelTester()
@@ -1642,11 +1642,11 @@
         .block_width(8)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+        .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_9_16_bw_16){
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_9_16_bw_16){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       TransposeMicrokernelTester()
@@ -1655,11 +1655,11 @@
         .block_width(16)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+        .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_9_16_bw_9_16) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_9_16_bw_9_16) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 9; i < 16; ++i){
       for(size_t j = 9; j < 16; ++j){
@@ -1669,12 +1669,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+          .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
       }
     }
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_8_bw_8_is_16) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_8_bw_8_is_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(16)
@@ -1682,10 +1682,10 @@
       .block_width(8)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_8_bw_8_os_16) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_8_bw_8_os_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -1693,10 +1693,10 @@
       .block_width(8)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
   }
 
-  TEST(X16_TRANSPOSE__8X8_REUSE_DEC_SSE2, bh_8_bw_8_is_16_os_16) {
+  TEST(X16_TRANSPOSE__8X8_REUSE_MOV_SSE2, bh_8_bw_8_is_16_os_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(16)
@@ -1704,7 +1704,7 @@
       .block_width(8)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2);
+      .Test(xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
diff --git a/test/x16-transpose.yaml b/test/x16-transpose.yaml
index b3b06ac..d7c1109 100644
--- a/test/x16-transpose.yaml
+++ b/test/x16-transpose.yaml
@@ -12,8 +12,8 @@
 - name: xnn_x16_transpose_ukernel__4x2_scalar_int
 - name: xnn_x16_transpose_ukernel__4x4_scalar_int
 - name: xnn_x16_transpose_ukernel__4x8_sse2
-- name: xnn_x16_transpose_ukernel__8x8_multi_dec_sse2
+- name: xnn_x16_transpose_ukernel__8x8_multi_mov_sse2
 - name: xnn_x16_transpose_ukernel__8x8_multi_switch_sse2
-- name: xnn_x16_transpose_ukernel__8x8_reuse_dec_sse2
+- name: xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2
 - name: xnn_x16_transpose_ukernel__8x8_reuse_multi_sse2
 - name: xnn_x16_transpose_ukernel__8x8_reuse_switch_sse2
diff --git a/test/x32-transpose.cc b/test/x32-transpose.cc
index 057e618..e90df29 100644
--- a/test/x32-transpose.cc
+++ b/test/x32-transpose.cc
@@ -2345,7 +2345,7 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_4_bw_4) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_4_bw_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -2353,10 +2353,10 @@
       .block_width(4)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_1_8_bw_1_8) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_1_8_bw_1_8) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 1; i <= 8; ++i){
       for(size_t j = 1; j <= 8; ++j){
@@ -2366,12 +2366,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+          .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
       }
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_4_bw_8) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_4_bw_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -2379,10 +2379,10 @@
       .block_width(8)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_4_bw_5_8) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_4_bw_5_8) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       TransposeMicrokernelTester()
@@ -2391,11 +2391,11 @@
         .block_width(i)
         .block_height(4)
         .iterations(1)
-        .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+        .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_8_bw_5_8) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_8_bw_5_8) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       TransposeMicrokernelTester()
@@ -2404,11 +2404,11 @@
         .block_width(i)
         .block_height(8)
         .iterations(1)
-        .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+        .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_8_bw_4) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_8_bw_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -2416,10 +2416,10 @@
       .block_width(4)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_5_8_bw_4){
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_5_8_bw_4){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       TransposeMicrokernelTester()
@@ -2428,11 +2428,11 @@
         .block_width(4)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+        .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_5_8_bw_8){
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_5_8_bw_8){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       TransposeMicrokernelTester()
@@ -2441,11 +2441,11 @@
         .block_width(8)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+        .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_5_8_bw_5_8) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_5_8_bw_5_8) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       for(size_t j = 5; j < 8; ++j){
@@ -2455,12 +2455,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+          .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
       }
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_4_bw_4_is_8) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_4_bw_4_is_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -2468,10 +2468,10 @@
       .block_width(4)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_4_bw_4_os_8) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_4_bw_4_os_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -2479,10 +2479,10 @@
       .block_width(4)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_MULTI_DEC_SSE2, bh_4_bw_4_is_8_os_8) {
+  TEST(X32_TRANSPOSE__4X4_MULTI_MOV_SSE2, bh_4_bw_4_is_8_os_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -2490,7 +2490,7 @@
       .block_width(4)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_multi_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_multi_mov_sse2);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -2798,7 +2798,7 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_4_bw_4) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_4_bw_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -2806,10 +2806,10 @@
       .block_width(4)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_1_8_bw_1_8) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_1_8_bw_1_8) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 1; i <= 8; ++i){
       for(size_t j = 1; j <= 8; ++j){
@@ -2819,12 +2819,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+          .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
       }
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_4_bw_8) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_4_bw_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -2832,10 +2832,10 @@
       .block_width(8)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_4_bw_5_8) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_4_bw_5_8) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       TransposeMicrokernelTester()
@@ -2844,11 +2844,11 @@
         .block_width(i)
         .block_height(4)
         .iterations(1)
-        .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+        .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_8_bw_5_8) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_8_bw_5_8) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       TransposeMicrokernelTester()
@@ -2857,11 +2857,11 @@
         .block_width(i)
         .block_height(8)
         .iterations(1)
-        .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+        .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_8_bw_4) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_8_bw_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -2869,10 +2869,10 @@
       .block_width(4)
       .block_height(8)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_5_8_bw_4){
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_5_8_bw_4){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       TransposeMicrokernelTester()
@@ -2881,11 +2881,11 @@
         .block_width(4)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+        .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_5_8_bw_8){
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_5_8_bw_8){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       TransposeMicrokernelTester()
@@ -2894,11 +2894,11 @@
         .block_width(8)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+        .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_5_8_bw_5_8) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_5_8_bw_5_8) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 5; i < 8; ++i){
       for(size_t j = 5; j < 8; ++j){
@@ -2908,12 +2908,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+          .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
       }
     }
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_4_bw_4_is_8) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_4_bw_4_is_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -2921,10 +2921,10 @@
       .block_width(4)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_4_bw_4_os_8) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_4_bw_4_os_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -2932,10 +2932,10 @@
       .block_width(4)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
   }
 
-  TEST(X32_TRANSPOSE__4X4_REUSE_DEC_SSE2, bh_4_bw_4_is_8_os_8) {
+  TEST(X32_TRANSPOSE__4X4_REUSE_MOV_SSE2, bh_4_bw_4_is_8_os_8) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(8)
@@ -2943,7 +2943,7 @@
       .block_width(4)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2);
+      .Test(xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
diff --git a/test/x32-transpose.yaml b/test/x32-transpose.yaml
index dd4e2a7..53d9995 100644
--- a/test/x32-transpose.yaml
+++ b/test/x32-transpose.yaml
@@ -20,10 +20,10 @@
 - name: xnn_x32_transpose_ukernel__4x2_scalar_float
 - name: xnn_x32_transpose_ukernel__4x4_scalar_float
 - name: xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl
-- name: xnn_x32_transpose_ukernel__4x4_multi_dec_sse2
+- name: xnn_x32_transpose_ukernel__4x4_multi_mov_sse2
 - name: xnn_x32_transpose_ukernel__4x4_multi_multi_sse2
 - name: xnn_x32_transpose_ukernel__4x4_multi_switch_sse2
-- name: xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2
+- name: xnn_x32_transpose_ukernel__4x4_reuse_mov_sse2
 - name: xnn_x32_transpose_ukernel__4x4_reuse_multi_sse2
 - name: xnn_x32_transpose_ukernel__4x4_reuse_switch_sse2
 - name: xnn_x32_transpose_ukernel__4x4_sse
diff --git a/test/x64-transpose.cc b/test/x64-transpose.cc
index a3991f8..9d52acd 100644
--- a/test/x64-transpose.cc
+++ b/test/x64-transpose.cc
@@ -1378,7 +1378,7 @@
 }
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_2_bw_2) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_2) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(2)
@@ -1386,10 +1386,10 @@
       .block_width(2)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_1_4_bw_1_4) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_1_4_bw_1_4) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 1; i <= 4; ++i){
       for(size_t j = 1; j <= 4; ++j){
@@ -1399,12 +1399,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+          .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
       }
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_2_bw_4) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -1412,10 +1412,10 @@
       .block_width(4)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_2_bw_3_4) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_3_4) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       TransposeMicrokernelTester()
@@ -1424,11 +1424,11 @@
         .block_width(i)
         .block_height(2)
         .iterations(1)
-        .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+        .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_4_bw_3_4) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_4_bw_3_4) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       TransposeMicrokernelTester()
@@ -1437,11 +1437,11 @@
         .block_width(i)
         .block_height(4)
         .iterations(1)
-        .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+        .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_4_bw_2) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_4_bw_2) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(2)
@@ -1449,10 +1449,10 @@
       .block_width(2)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_3_4_bw_2){
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_3_4_bw_2){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       TransposeMicrokernelTester()
@@ -1461,11 +1461,11 @@
         .block_width(2)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+        .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_3_4_bw_4){
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_3_4_bw_4){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       TransposeMicrokernelTester()
@@ -1474,11 +1474,11 @@
         .block_width(4)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+        .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_3_4_bw_3_4) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_3_4_bw_3_4) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       for(size_t j = 3; j < 4; ++j){
@@ -1488,12 +1488,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+          .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
       }
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_2_bw_2_is_4) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_2_is_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -1501,10 +1501,10 @@
       .block_width(2)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_2_bw_2_os_4) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_2_os_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(2)
@@ -1512,10 +1512,10 @@
       .block_width(2)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_MULTI_DEC_SSE2, bh_2_bw_2_is_4_os_4) {
+  TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_2_is_4_os_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -1523,7 +1523,7 @@
       .block_width(2)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_multi_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -1831,7 +1831,7 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_2_bw_2) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_2) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(2)
@@ -1839,10 +1839,10 @@
       .block_width(2)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_1_4_bw_1_4) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_1_4_bw_1_4) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 1; i <= 4; ++i){
       for(size_t j = 1; j <= 4; ++j){
@@ -1852,12 +1852,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+          .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
       }
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_2_bw_4) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -1865,10 +1865,10 @@
       .block_width(4)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_2_bw_3_4) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_3_4) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       TransposeMicrokernelTester()
@@ -1877,11 +1877,11 @@
         .block_width(i)
         .block_height(2)
         .iterations(1)
-        .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+        .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_4_bw_3_4) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_4_bw_3_4) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       TransposeMicrokernelTester()
@@ -1890,11 +1890,11 @@
         .block_width(i)
         .block_height(4)
         .iterations(1)
-        .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+        .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_4_bw_2) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_4_bw_2) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(2)
@@ -1902,10 +1902,10 @@
       .block_width(2)
       .block_height(4)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_3_4_bw_2){
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_3_4_bw_2){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       TransposeMicrokernelTester()
@@ -1914,11 +1914,11 @@
         .block_width(2)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+        .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_3_4_bw_4){
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_3_4_bw_4){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       TransposeMicrokernelTester()
@@ -1927,11 +1927,11 @@
         .block_width(4)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+        .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_3_4_bw_3_4) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_3_4_bw_3_4) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 3; i < 4; ++i){
       for(size_t j = 3; j < 4; ++j){
@@ -1941,12 +1941,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+          .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
       }
     }
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_2_bw_2_is_4) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_2_is_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -1954,10 +1954,10 @@
       .block_width(2)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_2_bw_2_os_4) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_2_os_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(2)
@@ -1965,10 +1965,10 @@
       .block_width(2)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
   }
 
-  TEST(X64_TRANSPOSE__2X2_REUSE_DEC_SSE2, bh_2_bw_2_is_4_os_4) {
+  TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_2_is_4_os_4) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(4)
@@ -1976,7 +1976,7 @@
       .block_width(2)
       .block_height(2)
       .iterations(1)
-      .Test(xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2);
+      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
diff --git a/test/x64-transpose.yaml b/test/x64-transpose.yaml
index d00246f..6caf955 100644
--- a/test/x64-transpose.yaml
+++ b/test/x64-transpose.yaml
@@ -14,9 +14,9 @@
 - name: xnn_x64_transpose_ukernel__2x2_scalar_float
 - name: xnn_x64_transpose_ukernel__4x1_scalar_float
 - name: xnn_x64_transpose_ukernel__4x2_scalar_float
-- name: xnn_x64_transpose_ukernel__2x2_multi_dec_sse2
+- name: xnn_x64_transpose_ukernel__2x2_multi_mov_sse2
 - name: xnn_x64_transpose_ukernel__2x2_multi_multi_sse2
 - name: xnn_x64_transpose_ukernel__2x2_multi_switch_sse2
-- name: xnn_x64_transpose_ukernel__2x2_reuse_dec_sse2
+- name: xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2
 - name: xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2
-- name: xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2
\ No newline at end of file
+- name: xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2
diff --git a/test/x8-transpose.cc b/test/x8-transpose.cc
index 6489f67..3f0c1b5 100644
--- a/test/x8-transpose.cc
+++ b/test/x8-transpose.cc
@@ -1106,7 +1106,7 @@
 }
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_16_bw_16) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_16_bw_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(16)
@@ -1114,10 +1114,10 @@
       .block_width(16)
       .block_height(16)
       .iterations(1)
-      .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+      .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_1_32_bw_1_32) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_1_32_bw_1_32) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 1; i <= 32; ++i){
       for(size_t j = 1; j <= 32; ++j){
@@ -1127,12 +1127,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+          .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
       }
     }
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_16_bw_32) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_16_bw_32) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(32)
@@ -1140,10 +1140,10 @@
       .block_width(32)
       .block_height(16)
       .iterations(1)
-      .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+      .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_16_bw_17_32) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_16_bw_17_32) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 17; i < 32; ++i){
       TransposeMicrokernelTester()
@@ -1152,11 +1152,11 @@
         .block_width(i)
         .block_height(16)
         .iterations(1)
-        .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+        .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
     }
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_32_bw_17_32) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_32_bw_17_32) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 17; i < 32; ++i){
       TransposeMicrokernelTester()
@@ -1165,11 +1165,11 @@
         .block_width(i)
         .block_height(32)
         .iterations(1)
-        .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+        .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
     }
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_32_bw_16) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_32_bw_16) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(16)
@@ -1177,10 +1177,10 @@
       .block_width(16)
       .block_height(32)
       .iterations(1)
-      .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+      .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_17_32_bw_16){
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_17_32_bw_16){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 17; i < 32; ++i){
       TransposeMicrokernelTester()
@@ -1189,11 +1189,11 @@
         .block_width(16)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+        .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
     }
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_17_32_bw_32){
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_17_32_bw_32){
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 17; i < 32; ++i){
       TransposeMicrokernelTester()
@@ -1202,11 +1202,11 @@
         .block_width(32)
         .block_height(i)
         .iterations(1)
-        .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+        .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
     }
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_17_32_bw_17_32) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_17_32_bw_17_32) {
     TEST_REQUIRES_X86_SSE2;
     for(size_t i = 17; i < 32; ++i){
       for(size_t j = 17; j < 32; ++j){
@@ -1216,12 +1216,12 @@
           .block_width(j)
           .block_height(i)
           .iterations(1)
-          .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+          .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
       }
     }
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_16_bw_16_is_32) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_16_bw_16_is_32) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(32)
@@ -1229,10 +1229,10 @@
       .block_width(16)
       .block_height(16)
       .iterations(1)
-      .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+      .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_16_bw_16_os_32) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_16_bw_16_os_32) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(16)
@@ -1240,10 +1240,10 @@
       .block_width(16)
       .block_height(16)
       .iterations(1)
-      .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+      .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
   }
 
-  TEST(X8_TRANSPOSE__16X16_REUSE_DEC_SSE2, bh_16_bw_16_is_32_os_32) {
+  TEST(X8_TRANSPOSE__16X16_REUSE_MOV_SSE2, bh_16_bw_16_is_32_os_32) {
     TEST_REQUIRES_X86_SSE2;
     TransposeMicrokernelTester()
       .input_stride(32)
@@ -1251,7 +1251,7 @@
       .block_width(16)
       .block_height(16)
       .iterations(1)
-      .Test(xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2);
+      .Test(xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
diff --git a/test/x8-transpose.yaml b/test/x8-transpose.yaml
index 731c6d5..50c3e79 100644
--- a/test/x8-transpose.yaml
+++ b/test/x8-transpose.yaml
@@ -11,5 +11,5 @@
 - name: xnn_x8_transpose_ukernel__4x1_scalar_int
 - name: xnn_x8_transpose_ukernel__4x2_scalar_int
 - name: xnn_x8_transpose_ukernel__4x4_scalar_int
-- name: xnn_x8_transpose_ukernel__16x16_reuse_dec_sse2
-- name: xnn_x8_transpose_ukernel__16x16_reuse_switch_sse2
\ No newline at end of file
+- name: xnn_x8_transpose_ukernel__16x16_reuse_mov_sse2
+- name: xnn_x8_transpose_ukernel__16x16_reuse_switch_sse2