Replace generic shuffle with narrow instructions in WAsm SIMD QS8/QU8/QC8 microkernels

Narrow instructions lower better on x86-64 than the shuffles they replaced:

End2End benchmark on Xeon W-2135:
- QS8MobileNetV1: 56184 us -> 55796 us
- QS8MobileNetV2: 37840 us -> 37362 us
- QU8MobileNetV1: 81489 us -> 80863 us
- QU8MobileNetV2: 51605 us -> 50872 us

PiperOrigin-RevId: 389565236

commit: 07706f6976c1de0f72b23dc49fc18386f5c4494b [log] [tgz]
author: Marat Dukhan <maratek@google.com> Sun Aug 08 23:48:40 2021 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> Sun Aug 08 23:49:23 2021 -0700
tree: 8b348630b414c8b29a991294e674656c8dd4b3a2
parent: dfc2db03388eb29adade291cc4e0efe6652b0472 [diff] [blame]
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
index 8bde857..895f8b7 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c

@@ -114,9 +114,9 @@
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
 
-    v128_t vacc00x0123 = wasm_v16x8_shuffle(vacc0x0123, vacc0x0123, 0, 2, 4, 6, 8, 10, 12, 14);
+    v128_t vacc00x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x0123);
 
-    v128_t vout = wasm_v8x16_shuffle(vacc00x0123, vacc00x0123, 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
+    v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
 
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
commit	07706f6976c1de0f72b23dc49fc18386f5c4494b	[log] [tgz]
author	Marat Dukhan <maratek@google.com>	Sun Aug 08 23:48:40 2021 -0700
committer	XNNPACK Team <xnnpack-github-robot@google.com>	Sun Aug 08 23:49:23 2021 -0700
tree	8b348630b414c8b29a991294e674656c8dd4b3a2
parent	dfc2db03388eb29adade291cc4e0efe6652b0472 [diff] [blame]