Reoptimize LUT-based NEON FMA Exp evaluation stub

Pre-compute the shift for 4 values with a SIMD instruction, and extract the high-lane table offsets with an explicit 32-bit cast to match the low-lane extraction.

PiperOrigin-RevId: 332801193
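
What the hunk below changes: each 64-bit lane of vidx packs two 32-bit byte
offsets into xnn_table_exp2_k_over_64. The low offset is already recovered
with a (uint32_t) truncation; this change gives the high offset, recovered
with a 32-bit right shift, the same explicit cast. A minimal scalar sketch of
that unpacking (the offset values are illustrative, not from the kernel):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* Two hypothetical 32-bit table byte offsets packed into one 64-bit lane. */
      const uint32_t lo_offset = 8;   /* e.g. entry 2: 2 * sizeof(float) */
      const uint32_t hi_offset = 20;  /* e.g. entry 5: 5 * sizeof(float) */
      const uint64_t vidx01 = (uint64_t) lo_offset | ((uint64_t) hi_offset << 32);

      /* Low lane: plain truncation, as in the existing code. */
      const uint32_t unpacked_lo = (uint32_t) vidx01;
      /* High lane: shift, then the (uint32_t) cast this commit adds. */
      const uint32_t unpacked_hi = (uint32_t) (vidx01 >> 32);

      printf("lo=%u hi=%u\n", unpacked_lo, unpacked_hi);  /* prints: lo=8 hi=20 */
      return 0;
    }
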
diff --git a/src/math/exp-neonfma-lut64-p2.c b/src/math/exp-neonfma-lut64-p2.c
index c9e13e3..6885f6c 100644
--- a/src/math/exp-neonfma-lut64-p2.c
+++ b/src/math/exp-neonfma-lut64-p2.c
@@ -71,8 +71,8 @@
     const uint64_t vidx23 = vgetq_lane_u64(vidx, 1);
     float32x2_t vl01 = vld1_dup_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) vidx01));
     float32x2_t vl23 = vld1_dup_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) vidx23));
-    vl01 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (vidx01 >> 32)), vl01, 1);
-    vl23 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (vidx23 >> 32)), vl23, 1);
+    vl01 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) (vidx01 >> 32)), vl01, 1);
+    vl23 = vld1_lane_f32((const float*) ((uintptr_t) xnn_table_exp2_k_over_64 + (uint32_t) (vidx23 >> 32)), vl23, 1);
     float32x4_t vl = vcombine_f32(vl01, vl23);
    // Fuse so into the value l fetched from the table by adjusting its exponent.
     vl = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), veo));
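
The last two lines of the hunk fuse the scale so = 2**eo into the table value
l by integer-adding exponent bits to l's IEEE-754 representation. A hedged
scalar sketch of that trick (it assumes l is positive and normal, and takes
eo as a plain integer, whereas in the kernel veo is already shifted into the
exponent field):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Multiply a positive, normal float by 2**eo by adding eo directly
       to its IEEE-754 exponent field (bits 23..30). */
    static float scale_by_pow2(float l, int32_t eo) {
      int32_t bits;
      memcpy(&bits, &l, sizeof(bits));  /* reinterpret float as int32 */
      bits += eo << 23;                 /* bump the biased exponent by eo */
      float result;
      memcpy(&result, &bits, sizeof(result));
      return result;
    }

    int main(void) {
      printf("%f\n", scale_by_pow2(1.5f, 4));  /* 1.5 * 2**4 = 24.000000 */
      return 0;
    }
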