[ARM][ParallelDSP] Enable multiple uses of loads
When choosing whether a pair of loads can be combined into a single
wide load, we check that the load only has a sext user and that sext
also only has one user. But this can prevent the transformation in
cases where parallel MACs use the same loaded data multiple times.
To enable this, we need to fix up any other uses after creating the
wide load: generating a trunc and a shift + trunc pair to recreate
the narrow values. We also need to keep a record of which loads have
already been widened.
Differential Revision: https://reviews.llvm.org/D59215
llvm-svn: 356132
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
new file mode 100644
index 0000000..3c19068
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
@@ -0,0 +1,251 @@
+; RUN: llc -O3 -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s | FileCheck %s
+; add_user: two-way 16-bit multiply-accumulate loop in which the sign-extended arg3[i] (%conv4) has a second user, the add feeding %count.
+; CHECK-LABEL: add_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxtah [[COUNT:r[0-9]+]], [[COUNT]], [[A]]
+define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0 ; the loop is entered only when %arg > 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2 ; result is not referenced anywhere in this function
+ %.pre27 = load i16, i16* %arg2, align 2 ; result is not referenced anywhere in this function
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] ; final MAC accumulator, or 0 if the loop was skipped
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] ; final count, or 0 if the loop was skipped
+ %res = add i32 %mac1.0.lcssa, %count.final ; the returned value combines both reductions so neither is dead
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] ; MAC accumulator
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] ; index i
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] ; second reduction, fed by the extra use
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2 ; arg3[i]
+ %add = add nuw nsw i32 %i.025, 1 ; i + 1, also the next value of i
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2 ; arg3[i+1]
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2 ; arg2[i]
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32 ; two users: %count.next and %mul
+ %count.next = add i32 %conv4, %count ; the extra, non-multiply user of the sign-extended arg3[i]
+ %mul = mul nsw i32 %conv, %conv4 ; arg2[i] * arg3[i]
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2 ; arg2[i+1]
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8 ; arg2[i+1] * arg3[i+1]
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10 ; accumulator + both products
+ %exitcond = icmp ne i32 %add, %arg ; keep looping while i + 1 != %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+; mul_bottom_user: same MAC loop, but the extra user of the sign-extended arg3[i] (%conv4, the lower-addressed element of its pair) is a mul feeding %count.
+; CHECK-LABEL: mul_bottom_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: mul [[COUNT:r[0-9]+]], [[SXT]], [[COUNT]]
+define i32 @mul_bottom_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0 ; the loop is entered only when %arg > 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2 ; result is not referenced anywhere in this function
+ %.pre27 = load i16, i16* %arg2, align 2 ; result is not referenced anywhere in this function
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] ; final MAC accumulator, or 0 if the loop was skipped
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] ; final count, or 0 if the loop was skipped
+ %res = add i32 %mac1.0.lcssa, %count.final ; the returned value combines both reductions so neither is dead
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] ; MAC accumulator
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] ; index i
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] ; second reduction; starts at 0 and is only ever multiplied
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2 ; arg3[i]
+ %add = add nuw nsw i32 %i.025, 1 ; i + 1, also the next value of i
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2 ; arg3[i+1]
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2 ; arg2[i]
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32 ; two users: %mul and %count.next
+ %mul = mul nsw i32 %conv, %conv4 ; arg2[i] * arg3[i]
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2 ; arg2[i+1]
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8 ; arg2[i+1] * arg3[i+1]
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10 ; accumulator + both products
+ %count.next = mul i32 %conv4, %count ; the extra multiply user of the sign-extended arg3[i]
+ %exitcond = icmp ne i32 %add, %arg ; keep looping while i + 1 != %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+; mul_top_user: same MAC loop, but the extra mul user takes the sign-extended arg2[i+1] (%conv7, the higher-addressed element of its pair).
+; CHECK-LABEL: mul_top_user
+; CHECK: %for.body
+; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: asr.w [[ASR:[rl0-9]+]], [[ASR]], #16
+; CHECK: mul [[COUNT:[rl0-9]+]], [[ASR]], [[COUNT]]
+define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0 ; the loop is entered only when %arg > 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2 ; result is not referenced anywhere in this function
+ %.pre27 = load i16, i16* %arg2, align 2 ; result is not referenced anywhere in this function
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] ; final MAC accumulator, or 0 if the loop was skipped
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] ; final count, or 0 if the loop was skipped
+ %res = add i32 %mac1.0.lcssa, %count.final ; the returned value combines both reductions so neither is dead
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] ; MAC accumulator
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] ; index i
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] ; second reduction; starts at 0 and is only ever multiplied
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2 ; arg3[i]
+ %add = add nuw nsw i32 %i.025, 1 ; i + 1, also the next value of i
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2 ; arg3[i+1]
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %2 = load i16, i16* %arrayidx3, align 2 ; arg2[i]
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32
+ %mul = mul nsw i32 %conv, %conv4 ; arg2[i] * arg3[i]
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %3 = load i16, i16* %arrayidx6, align 2 ; arg2[i+1]
+ %conv7 = sext i16 %3 to i32 ; two users: %mul9 and %count.next
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8 ; arg2[i+1] * arg3[i+1]
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10 ; accumulator + both products
+ %count.next = mul i32 %conv7, %count ; the extra multiply user of the sign-extended arg2[i+1]
+ %exitcond = icmp ne i32 %add, %arg ; keep looping while i + 1 != %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+; and_user: same MAC loop, but the sign-extended arg3[i] (%conv4) is also masked to its low 16 bits, and that masked value feeds the mul producing %count.
+; CHECK-LABEL: and_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: uxth [[UXT:r[0-9]+]], [[A]]
+; CHECK: mul [[MUL:r[0-9]+]], [[UXT]], [[MUL]]
+define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0 ; the loop is entered only when %arg > 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2 ; result is not referenced anywhere in this function
+ %.pre27 = load i16, i16* %arg2, align 2 ; result is not referenced anywhere in this function
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] ; final MAC accumulator, or 0 if the loop was skipped
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] ; final count, or 0 if the loop was skipped
+ %res = add i32 %mac1.0.lcssa, %count.final ; the returned value combines both reductions so neither is dead
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] ; MAC accumulator
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] ; index i
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] ; second reduction; starts at 0 and is only ever multiplied
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2 ; arg3[i]
+ %add = add nuw nsw i32 %i.025, 1 ; i + 1, also the next value of i
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2 ; arg3[i+1]
+ %2 = load i16, i16* %arrayidx3, align 2 ; arg2[i]
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32 ; two users: %bottom and %mul
+ %bottom = and i32 %conv4, 65535 ; keeps only the low 16 bits of the sign-extended arg3[i]
+ %mul = mul nsw i32 %conv, %conv4 ; arg2[i] * arg3[i]
+ %3 = load i16, i16* %arrayidx6, align 2 ; arg2[i+1]
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8 ; arg2[i+1] * arg3[i+1]
+ %add10 = add i32 %mul, %mac1.026
+ %add11 = add i32 %mul9, %add10 ; accumulator + both products
+ %count.next = mul i32 %bottom, %count ; consumes the masked value rather than the sext directly
+ %exitcond = icmp ne i32 %add, %arg ; keep looping while i + 1 != %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+; multi_uses: same MAC loop, but the sign-extended arg3[i] (%conv4) has three users: the MAC mul, an and-mask (%bottom) and a shl by 16 (%shl).
+; CHECK-LABEL: multi_uses
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: eor.w [[EOR:r[0-9]+]], [[SXT]], [[SHIFT:r[0-9]+]]
+; CHECK: mul [[MUL:r[0-9]+]], [[EOR]], [[SXT]]
+; CHECK: lsl.w [[SHIFT]], [[MUL]], #16
+define i32 @multi_uses(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+ %cmp24 = icmp sgt i32 %arg, 0 ; the loop is entered only when %arg > 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %.pre = load i16, i16* %arg3, align 2 ; result is not referenced anywhere in this function
+ %.pre27 = load i16, i16* %arg2, align 2 ; result is not referenced anywhere in this function
+ br label %for.body
+
+for.cond.cleanup:
+ %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] ; final MAC accumulator, or 0 if the loop was skipped
+ %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] ; final count, or 0 if the loop was skipped
+ %res = add i32 %mac1.0.lcssa, %count.final ; the returned value combines both reductions so neither is dead
+ ret i32 %res
+
+for.body:
+ %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] ; MAC accumulator
+ %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] ; index i
+ %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] ; second reduction, fed by the extra uses
+ %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+ %0 = load i16, i16* %arrayidx, align 2 ; arg3[i]
+ %add = add nuw nsw i32 %i.025, 1 ; i + 1, also the next value of i
+ %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+ %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+ %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+ %1 = load i16, i16* %arrayidx1, align 2 ; arg3[i+1]
+ %2 = load i16, i16* %arrayidx3, align 2 ; arg2[i]
+ %conv = sext i16 %2 to i32
+ %conv4 = sext i16 %0 to i32 ; three users: %bottom, %mul and %shl
+ %bottom = and i32 %conv4, 65535 ; keeps only the low 16 bits of the sign-extended arg3[i]
+ %mul = mul nsw i32 %conv, %conv4 ; arg2[i] * arg3[i]
+ %3 = load i16, i16* %arrayidx6, align 2 ; arg2[i+1]
+ %conv7 = sext i16 %3 to i32
+ %conv8 = sext i16 %1 to i32
+ %mul9 = mul nsw i32 %conv7, %conv8 ; arg2[i+1] * arg3[i+1]
+ %add10 = add i32 %mul, %mac1.026
+ %shl = shl i32 %conv4, 16 ; moves the low 16 bits of the sign-extended arg3[i] into the upper half
+ %add11 = add i32 %mul9, %add10 ; accumulator + both products
+ %xor = xor i32 %bottom, %count
+ %count.next = mul i32 %xor, %shl ; combines both extra uses of %conv4 into the second reduction
+ %exitcond = icmp ne i32 %add, %arg ; keep looping while i + 1 != %arg
+ br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}