ARM: Combine multiply-accumulate operations.
Try to combine an integer multiply and add (or sub) into a single multiply-accumulate (MAC) operation.
For AArch64, also try to combine a long-type multiply and add (or sub).
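For example, for
    d = c + a * b
the backend can emit a single multiply-add (madd on AArch64, mla on Thumb2)
instead of a mul followed by an add; d = c - a * b similarly maps to
msub / mls. (Illustrative only; actual instruction selection is up to the
backend.)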
Change-Id: Ic85812e941eb5a66abc355cab81a4dd16de1b66e
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index fc96075..f78b39f 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -426,6 +426,10 @@
if (bb->block_type == kDead) {
return true;
}
+ // Currently, multiply-accumulate backend support is only available on arm32 and arm64.
+ if (cu_->instruction_set == kArm64 || cu_->instruction_set == kThumb2) {
+ MultiplyAddOpt(bb);
+ }
bool use_lvn = bb->use_lvn && (cu_->disable_opt & (1u << kLocalValueNumbering)) == 0u;
std::unique_ptr<ScopedArenaAllocator> allocator;
std::unique_ptr<GlobalValueNumbering> global_valnum;
@@ -1709,4 +1713,216 @@
temp_.sce.inliner = nullptr;
}
+bool MIRGraph::CanThrow(MIR* mir) {
+ if ((mir->dalvikInsn.FlagsOf() & Instruction::kThrow) == 0) {
+ return false;
+ }
+ const int opt_flags = mir->optimization_flags;
+ uint64_t df_attributes = GetDataFlowAttributes(mir);
+
+ if (((df_attributes & DF_HAS_NULL_CHKS) != 0) && ((opt_flags & MIR_IGNORE_NULL_CHECK) == 0)) {
+ return true;
+ }
+ if ((df_attributes & DF_IFIELD) != 0) {
+ // The IGET/IPUT family.
+ const MirIFieldLoweringInfo& field_info = GetIFieldLoweringInfo(mir);
+ bool fast = (df_attributes & DF_DA) ? field_info.FastGet() : field_info.FastPut();
+ // Already processed null check above.
+ if (fast) {
+ return false;
+ }
+ } else if ((df_attributes & DF_HAS_RANGE_CHKS) != 0) {
+ // The AGET/APUT family.
+ // Already processed null check above.
+ if ((opt_flags & MIR_IGNORE_RANGE_CHECK) != 0) {
+ return false;
+ }
+ } else if ((df_attributes & DF_SFIELD) != 0) {
+ // The SGET/SPUT family.
+ const MirSFieldLoweringInfo& field_info = GetSFieldLoweringInfo(mir);
+ bool fast = (df_attributes & DF_DA) ? field_info.FastGet() : field_info.FastPut();
+ bool is_class_initialized = field_info.IsClassInitialized() ||
+ ((mir->optimization_flags & MIR_CLASS_IS_INITIALIZED) != 0);
+ if (fast && is_class_initialized) {
+ return false;
+ }
+ }
+ return true;
+}
+
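+// Returns true if |second| defines a vreg that |first| uses, i.e. |second| would overwrite
+// one of |first|'s inputs (a write-after-read hazard).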
+bool MIRGraph::HasAntiDependency(MIR* first, MIR* second) {
+ DCHECK(first->ssa_rep != nullptr);
+ DCHECK(second->ssa_rep != nullptr);
+ if ((second->ssa_rep->num_defs > 0) && (first->ssa_rep->num_uses > 0)) {
+ int vreg0 = SRegToVReg(second->ssa_rep->defs[0]);
+ int vreg1 = (second->ssa_rep->num_defs == 2) ?
+ SRegToVReg(second->ssa_rep->defs[1]) : INVALID_VREG;
+ for (int i = 0; i < first->ssa_rep->num_uses; i++) {
+ int32_t use = SRegToVReg(first->ssa_rep->uses[i]);
+ if (use == vreg0 || use == vreg1) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void MIRGraph::CombineMultiplyAdd(MIR* mul_mir, MIR* add_mir, bool mul_is_first_addend,
+ bool is_wide, bool is_sub) {
+ if (is_wide) {
+ if (is_sub) {
+ add_mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpMsubLong);
+ } else {
+ add_mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpMaddLong);
+ }
+ } else {
+ if (is_sub) {
+ add_mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpMsubInt);
+ } else {
+ add_mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpMaddInt);
+ }
+ }
+ add_mir->ssa_rep->num_uses = is_wide ? 6 : 3;
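+ // Read the addend sregs out of the old use list before AllocateSSAUseData() below may
+ // reallocate it. New layout: multiplicands first, addend in the last slot(s).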
+ int32_t addend0 = INVALID_SREG;
+ int32_t addend1 = INVALID_SREG;
+ if (is_wide) {
+ addend0 = mul_is_first_addend ? add_mir->ssa_rep->uses[2] : add_mir->ssa_rep->uses[0];
+ addend1 = mul_is_first_addend ? add_mir->ssa_rep->uses[3] : add_mir->ssa_rep->uses[1];
+ } else {
+ addend0 = mul_is_first_addend ? add_mir->ssa_rep->uses[1] : add_mir->ssa_rep->uses[0];
+ }
+
+ AllocateSSAUseData(add_mir, add_mir->ssa_rep->num_uses);
+ add_mir->ssa_rep->uses[0] = mul_mir->ssa_rep->uses[0];
+ add_mir->ssa_rep->uses[1] = mul_mir->ssa_rep->uses[1];
+ // Clear the SSA use count of the original multiply's product; it is no longer used.
+ raw_use_counts_[mul_mir->ssa_rep->defs[0]] = 0;
+ use_counts_[mul_mir->ssa_rep->defs[0]] = 0;
+ if (is_wide) {
+ DCHECK_EQ(add_mir->ssa_rep->num_uses, 6);
+ add_mir->ssa_rep->uses[2] = mul_mir->ssa_rep->uses[2];
+ add_mir->ssa_rep->uses[3] = mul_mir->ssa_rep->uses[3];
+ add_mir->ssa_rep->uses[4] = addend0;
+ add_mir->ssa_rep->uses[5] = addend1;
+ raw_use_counts_[mul_mir->ssa_rep->defs[1]] = 0;
+ use_counts_[mul_mir->ssa_rep->defs[1]] = 0;
+ } else {
+ DCHECK_EQ(add_mir->ssa_rep->num_uses, 3);
+ add_mir->ssa_rep->uses[2] = addend0;
+ }
+ // Copy in the decoded instruction information.
+ add_mir->dalvikInsn.vB = SRegToVReg(add_mir->ssa_rep->uses[0]);
+ if (is_wide) {
+ add_mir->dalvikInsn.vC = SRegToVReg(add_mir->ssa_rep->uses[2]);
+ add_mir->dalvikInsn.arg[0] = SRegToVReg(add_mir->ssa_rep->uses[4]);
+ } else {
+ add_mir->dalvikInsn.vC = SRegToVReg(add_mir->ssa_rep->uses[1]);
+ add_mir->dalvikInsn.arg[0] = SRegToVReg(add_mir->ssa_rep->uses[2]);
+ }
+ // Turn the original multiply MIR into a nop.
+ mul_mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+}
+
+void MIRGraph::MultiplyAddOpt(BasicBlock* bb) {
+ if (bb->block_type == kDead) {
+ return;
+ }
+ ScopedArenaAllocator allocator(&cu_->arena_stack);
+ ScopedArenaSafeMap<uint32_t, MIR*> ssa_mul_map(std::less<uint32_t>(), allocator.Adapter());
+ ScopedArenaSafeMap<uint32_t, MIR*>::iterator map_it;
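+ // ssa_mul_map tracks candidate multiplies by the SSA reg of their product, so the add/sub
+ // cases below can find the producing multiply through their use sregs.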
+ for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
+ Instruction::Code opcode = mir->dalvikInsn.opcode;
+ bool is_sub = true;
+ bool is_candidate_multiply = false;
+ switch (opcode) {
+ case Instruction::MUL_INT:
+ case Instruction::MUL_INT_2ADDR:
+ is_candidate_multiply = true;
+ break;
+ case Instruction::MUL_LONG:
+ case Instruction::MUL_LONG_2ADDR:
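+ // 64-bit MAC is only supported by the arm64 backend (this pass only runs on arm32/arm64).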
+ if (cu_->target64) {
+ is_candidate_multiply = true;
+ }
+ break;
+ case Instruction::ADD_INT:
+ case Instruction::ADD_INT_2ADDR:
+ is_sub = false;
+ FALLTHROUGH_INTENDED;
+ case Instruction::SUB_INT:
+ case Instruction::SUB_INT_2ADDR:
+ if (((map_it = ssa_mul_map.find(mir->ssa_rep->uses[0])) != ssa_mul_map.end()) && !is_sub) {
+ // a*b+c
+ CombineMultiplyAdd(map_it->second, mir, true /* product is the first addend */,
+ false /* is_wide */, false /* is_sub */);
+ ssa_mul_map.erase(mir->ssa_rep->uses[0]);
+ } else if ((map_it = ssa_mul_map.find(mir->ssa_rep->uses[1])) != ssa_mul_map.end()) {
+ // c+a*b or c-a*b
+ CombineMultiplyAdd(map_it->second, mir, false /* product is the second addend */,
+ false /* is_wide */, is_sub);
+ ssa_mul_map.erase(map_it);
+ }
+ break;
+ case Instruction::ADD_LONG:
+ case Instruction::ADD_LONG_2ADDR:
+ is_sub = false;
+ FALLTHROUGH_INTENDED;
+ case Instruction::SUB_LONG:
+ case Instruction::SUB_LONG_2ADDR:
+ if (!cu_->target64) {
+ break;
+ }
+ if ((map_it = ssa_mul_map.find(mir->ssa_rep->uses[0])) != ssa_mul_map.end() && !is_sub) {
+ // a*b+c
+ CombineMultiplyAdd(map_it->second, mir, true /* product is the first addend */,
+ true /* is_wide */, false /* is_sub */);
+ ssa_mul_map.erase(map_it);
+ } else if ((map_it = ssa_mul_map.find(mir->ssa_rep->uses[2])) != ssa_mul_map.end()) {
+ // c+a*b or c-a*b
+ CombineMultiplyAdd(map_it->second, mir, false /* product is the second addend */,
+ true /* is_wide */, is_sub);
+ ssa_mul_map.erase(map_it);
+ }
+ break;
+ default:
+ if (!ssa_mul_map.empty() && CanThrow(mir)) {
+ // Do not combine a multiply and an add/sub across a potentially throwing instruction.
+ ssa_mul_map.clear();
+ }
+ break;
+ }
+
+ // Exclude the case where this MIR writes a vreg that a recorded candidate multiply uses.
+ // The register allocator may assign both the same physical register; the multiplier would
+ // then already be overwritten by the time the combined multiply-add executes, so such
+ // candidates must be dropped.
+ if (ssa_mul_map.size() > 0) {
+ for (auto it = ssa_mul_map.begin(); it != ssa_mul_map.end();) {
+ MIR* mul = it->second;
+ if (HasAntiDependency(mul, mir)) {
+ it = ssa_mul_map.erase(it);
+ } else {
+ ++it;
+ }
+ }
+ }
+
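+ // Record this multiply as a candidate only if its product has exactly one use and this is
+ // not the last MIR in the block.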
+ if (is_candidate_multiply &&
+ (GetRawUseCount(mir->ssa_rep->defs[0]) == 1) && (mir->next != nullptr)) {
+ ssa_mul_map.Put(mir->ssa_rep->defs[0], mir);
+ }
+ }
+}
+
} // namespace art