Reoptimize LUT-based NEONFMA Exp/ExpMinus evaluation stubs

Pre-compute the table-index shift for all 4 values with a single SIMD instruction

PiperOrigin-RevId: 332798855
diff --git a/src/math/exp-neonfma-lut64-p2.c b/src/math/exp-neonfma-lut64-p2.c
index f4f6ee8..c9e13e3 100644
--- a/src/math/exp-neonfma-lut64-p2.c
+++ b/src/math/exp-neonfma-lut64-p2.c
@@ -66,13 +66,13 @@
     const float32x4_t vsn = vreinterpretq_f32_s32(vaddq_s32(ven, vdefault_exponent));
 
     // Use the low 6 bits of n (as integer) for table lookup.
-    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2));
     const uint64_t vidx01 = vgetq_lane_u64(vidx, 0);
     const uint64_t vidx23 = vgetq_lane_u64(vidx, 1);
-    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]);
-    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]);
-    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1);
-    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    float32x2_t vl01 = vld1_dup_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) vidx01));
+    float32x2_t vl23 = vld1_dup_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) vidx23));
+    vl01 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (vidx01 >> 32)), vl01, 1);
+    vl23 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (vidx23 >> 32)), vl23, 1);
     float32x4_t vl = vcombine_f32(vl01, vl23);
     // Fuse eo into the value l fetched from the table by adjusting its exponent.
     vl = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), veo));
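
For reference, a minimal self-contained sketch of the updated gather sequence (the `table` array and `gather4` wrapper are illustrative stand-ins, not XNNPACK declarations): one vshlq_n_s32 pre-multiplies all four masked indices by sizeof(float) inside the vector register, so the extracted 32-bit lanes are already byte offsets and the scalar loads no longer scale their indices individually.

    #include <arm_neon.h>
    #include <stdint.h>

    // Illustrative stand-in for the 64-entry xnn_table_exp2_k_over_64 table.
    extern const float table[64];

    // Gather 4 table entries selected by the low 6 bits of each lane of vn_bits.
    static float32x4_t gather4(int32x4_t vn_bits) {
      const int32x4_t vindex_mask = vdupq_n_s32(INT32_C(0x3F));
      // One SIMD shift pre-multiplies all four indices by sizeof(float),
      // so the lanes extracted below are ready-to-use byte offsets.
      const uint64x2_t vidx = vreinterpretq_u64_s32(
          vshlq_n_s32(vandq_s32(vn_bits, vindex_mask), 2));
      const uint64_t vidx01 = vgetq_lane_u64(vidx, 0);
      const uint64_t vidx23 = vgetq_lane_u64(vidx, 1);
      float32x2_t vl01 = vld1_dup_f32((const float*) ((uintptr_t) table + (uint32_t) vidx01));
      float32x2_t vl23 = vld1_dup_f32((const float*) ((uintptr_t) table + (uint32_t) vidx23));
      vl01 = vld1_lane_f32((const float*) ((uintptr_t) table + (vidx01 >> 32)), vl01, 1);
      vl23 = vld1_lane_f32((const float*) ((uintptr_t) table + (vidx23 >> 32)), vl23, 1);
      return vcombine_f32(vl01, vl23);
    }

Previously each scalar load indexed the table directly (e.g. xnn_table_exp2_k_over_64[(uint32_t) vidx01]), leaving the by-4 scaling to every load's address computation; doing the scaling once in the vector register removes those per-lane shifts.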