ART: Implement FP packed reduce for x86
This patch implements correct FP vector reduction by index.
Previous implementation corresponded to packed add reduction.
Change-Id: I02a9bcb8e8945937ba7a511b723f23ec30667d34
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 5f6cdda..230f611 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -2209,18 +2209,36 @@
// Handle float case.
// TODO Add support for fast math (not value safe) and do horizontal add in that case.
+ int extract_index = mir->dalvikInsn.arg[0];
+
rl_result = EvalLoc(rl_dest, kFPReg, true);
NewLIR2(kX86PxorRR, rl_result.reg.GetReg(), rl_result.reg.GetReg());
- NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
- // Since FP must keep order of operation for value safety, we shift to low
- // 32-bits and add to result.
- for (int i = 0; i < 3; i++) {
- NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39);
+ if (LIKELY(extract_index != 0)) {
+ // We know the index of element which we want to extract. We want to extract it and
+ // keep values in vector register correct for future use. So the way we act is:
+ // 1. Generate shuffle mask that allows to swap zeroth and required elements;
+ // 2. Shuffle vector register with this mask;
+ // 3. Extract zeroth element where required value lies;
+ // 4. Shuffle with same mask again to restore original values in vector register.
+ // The mask is generated from equivalence mask 0b11100100 swapping 0th and extracted
+ // element indices.
+ int shuffle[4] = {0b00, 0b01, 0b10, 0b11};
+ shuffle[0] = extract_index;
+ shuffle[extract_index] = 0;
+ int mask = 0;
+ for (int i = 0; i < 4; i++) {
+ mask |= (shuffle[i] << (2 * i));
+ }
+ NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), mask);
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+ NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), mask);
+ } else {
+ // We need to extract zeroth element and don't need any complex stuff to do it.
NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
}
- StoreValue(rl_dest, rl_result);
+ StoreFinalValue(rl_dest, rl_result);
} else if (opsize == kDouble) {
// TODO Handle double case.
LOG(FATAL) << "Unsupported add reduce for double.";