Reoptimize LUT-based NEON FMA Exp evaluation stub

Pre-compute the shift for 4 values with a SIMD instruction, and extract the high-lane table offsets with an explicit 32-bit cast to match the low-lane extraction.

PiperOrigin-RevId: 332801193
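
What the hunk below changes: each 64-bit lane of vidx packs two 32-bit byte
offsets into xnn_table_exp2_k_over_64. The low offset is already recovered
with a (uint32_t) truncation; this change gives the high offset, recovered
with a 32-bit right shift, the same explicit cast. A minimal scalar sketch of
that unpacking (the offset values are illustrative, not from the kernel):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* Two hypothetical 32-bit table byte offsets packed into one 64-bit lane. */
      const uint32_t lo_offset = 8;   /* e.g. entry 2: 2 * sizeof(float) */
      const uint32_t hi_offset = 20;  /* e.g. entry 5: 5 * sizeof(float) */
      const uint64_t vidx01 = (uint64_t) lo_offset | ((uint64_t) hi_offset << 32);

      /* Low lane: plain truncation, as in the existing code. */
      const uint32_t unpacked_lo = (uint32_t) vidx01;
      /* High lane: shift, then the (uint32_t) cast this commit adds. */
      const uint32_t unpacked_hi = (uint32_t) (vidx01 >> 32);

      printf("lo=%u hi=%u\n", unpacked_lo, unpacked_hi);  /* prints: lo=8 hi=20 */
      return 0;
    }
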
diff --git a/src/math/exp-neonfma-lut64-p2.c b/src/math/exp-neonfma-lut64-p2.c
index c9e13e3..6885f6c 100644
--- a/src/math/exp-neonfma-lut64-p2.c
+++ b/src/math/exp-neonfma-lut64-p2.c
@@ -71,8 +71,8 @@
     const uint64_t vidx23 = vgetq_lane_u64(vidx, 1);
     float32x2_t vl01 = vld1_dup_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) vidx01));
     float32x2_t vl23 = vld1_dup_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) vidx23));
-    vl01 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (vidx01 >> 32)), vl01, 1);
-    vl23 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (vidx23 >> 32)), vl23, 1);
+    vl01 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) (vidx01 >> 32)), vl01, 1);
+    vl23 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) (vidx23 >> 32)), vl23, 1);
     float32x4_t vl = vcombine_f32(vl01, vl23);
    // Fuse so into the value l fetched from the table by adjusting its exponent.
     vl = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), veo));
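
The last two lines of the hunk fuse the scale so = 2**eo into the table value
l by integer-adding exponent bits to l's IEEE-754 representation. A hedged
scalar sketch of that trick (it assumes l is positive and normal, and takes
eo as a plain integer, whereas in the kernel veo is already shifted into the
exponent field):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Multiply a positive, normal float by 2**eo by adding eo directly
       to its IEEE-754 exponent field (bits 23..30). */
    static float scale_by_pow2(float l, int32_t eo) {
      int32_t bits;
      memcpy(&bits, &l, sizeof(bits));  /* reinterpret float as int32 */
      bits += eo << 23;                 /* bump the biased exponent by eo */
      float result;
      memcpy(&result, &bits, sizeof(result));
      return result;
    }

    int main(void) {
      printf("%f\n", scale_by_pow2(1.5f, 4));  /* 1.5 * 2**4 = 24.000000 */
      return 0;
    }
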