SSE transpose microkernel

- New microkernel (src/x32-transpose/4x4-sse.c)
- Unit tests (x32-transpose-test)
- Benchmarks (x32-transpose-bench)

PiperOrigin-RevId: 416515720
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 42ca9c5..8b653b8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3199,7 +3199,8 @@
   src/math/sqrt-sse-hh1mac.c
   src/math/sqrt-sse-nr1mac.c
   src/math/sqrt-sse-nr2mac.c
-  src/x32-packx/x4-sse.c)
+  src/x32-packx/x4-sse.c
+  src/x32-transpose/4x4-sse.c)
 
 SET(PROD_SSE2_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-sse2-int16-x32.c
@@ -7556,6 +7557,15 @@
   TARGET_LINK_LIBRARIES(x32-depthtospace2d-chw2hwc-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
   ADD_TEST(x32-depthtospace2d-chw2hwc-test x32-depthtospace2d-chw2hwc-test)
 
+  ADD_EXECUTABLE(x32-transpose-test test/x32-transpose.cc $<TARGET_OBJECTS:all_microkernels>)  # x32 transpose unit test: test source plus the object files of all_microkernels
+  SET_TARGET_PROPERTIES(x32-transpose-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)  # C++11 is required (no fallback to an older standard); compiler extensions stay enabled
+  TARGET_INCLUDE_DIRECTORIES(x32-transpose-test PRIVATE include src test)  # include paths apply to this target only
+  TARGET_LINK_LIBRARIES(x32-transpose-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+  ADD_TEST(x32-transpose-test x32-transpose-test)  # register a CTest test named x32-transpose-test that runs the command x32-transpose-test
+
   ADD_EXECUTABLE(x32-zip-test test/x32-zip.cc $<TARGET_OBJECTS:all_microkernels>)
   SET_TARGET_PROPERTIES(x32-zip-test PROPERTIES
     CXX_STANDARD 11
@@ -8309,4 +8319,12 @@
     CXX_EXTENSIONS YES)
   TARGET_INCLUDE_DIRECTORIES(x8-lut-bench PRIVATE . include src)
   TARGET_LINK_LIBRARIES(x8-lut-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)
+
+  ADD_EXECUTABLE(x32-transpose-bench bench/x32-transpose.cc $<TARGET_OBJECTS:all_microkernels>)  # x32 transpose benchmark: bench source plus the object files of all_microkernels
+  SET_TARGET_PROPERTIES(x32-transpose-bench PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)  # C++11 is required (no fallback to an older standard); compiler extensions stay enabled
+  TARGET_INCLUDE_DIRECTORIES(x32-transpose-bench PRIVATE . include src)  # include paths apply to this target only
+  TARGET_LINK_LIBRARIES(x32-transpose-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)  # no ADD_TEST here: the benchmark is built but not registered with CTest
 ENDIF()