[SDAG] Make the DAGCombine worklist not grow endlessly due to duplicate
insertions.
The old behavior could cause arbitrarily bad memory usage in the DAG
combiner when there was heavy traffic of re-adding nodes that were
already on the worklist. This commit switches the DAG combine worklist
to work the same way as the instcombine worklist: removed entries are
nulled out and only genuinely new entries are appended. My measurements
of codegen time show a slight improvement. Memory utilization is,
unsurprisingly, dominated by other factors (the IR and the DAG itself,
I suspect).
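
For illustration, here is a minimal, self-contained sketch of that
worklist scheme (membership map plus null-out removal). The class and
member names are hypothetical stand-ins, not the actual DAGCombiner
fields:

```cpp
#include <cstddef>
#include <unordered_map>
#include <vector>

struct SDNode; // stand-in for the real SelectionDAG node type

class CombineWorklist {
  std::vector<SDNode *> Worklist;                         // insertion-ordered entries
  std::unordered_map<SDNode *, std::size_t> WorklistMap;  // node -> index in Worklist

public:
  // Add a node only if it is not already present, so duplicate insertions
  // can no longer grow the vector.
  void add(SDNode *N) {
    if (WorklistMap.insert({N, Worklist.size()}).second)
      Worklist.push_back(N);
  }

  // Remove a node by nulling out its slot; the vector is never compacted,
  // so the indices stored in the map stay valid.
  void remove(SDNode *N) {
    auto It = WorklistMap.find(N);
    if (It == WorklistMap.end())
      return;
    Worklist[It->second] = nullptr;
    WorklistMap.erase(It);
  }

  // Pop the most recently added live entry, skipping nulled-out slots.
  SDNode *pop() {
    while (!Worklist.empty()) {
      SDNode *N = Worklist.back();
      Worklist.pop_back();
      if (N) {
        WorklistMap.erase(N);
        return N;
      }
    }
    return nullptr;
  }
};
```

With this scheme, re-adding a node that is already queued is a no-op,
so the vector grows at most once per distinct node.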
This change results in subtle, frustrating churn in the particular order
in which DAG combines are applied, which causes a number of minor
regressions where we fail to match a pattern previously matched by
accident. AFAICT, all of these should either be using AddToWorklist
directly or should be written in a less brittle way. None of the changes
seem drastically bad, and a few of them seem distinctly better.
A major change required to make this work is to significantly harden the
way in which the DAG combiner handles nodes that become dead (zero
uses). Previously, we relied on the ability to "priority-bump" them on
the combine worklist to achieve recursive deletion of these nodes and to
ensure that the frontier of remaining live nodes was all added to the
worklist. Instead, I've introduced a routine that implements that precise
logic directly, with no indirection. It is a significantly simpler
operation than the combiner worklist proper. I suspect this will also
fix some other problems with the combiner.
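
A hedged sketch of that dead-node cleanup follows. The Node type and the
deleteFromDAG/addToWorklist hooks are hypothetical stand-ins for the
SelectionDAG API, not the routine actually added by this commit:

```cpp
#include <vector>

struct Node {
  unsigned NumUses = 0;
  std::vector<Node *> Operands;
};

// Hypothetical hooks standing in for the DAG's dead-node removal and
// DAGCombiner::AddToWorklist.
void deleteFromDAG(Node *) { /* release the node's resources */ }
void addToWorklist(Node *) { /* queue the node for another combine visit */ }

// Delete a node with zero uses, recursively delete any operands that become
// dead as a result, and re-add the frontier of still-live operands to the
// combine worklist.
void recursivelyDeleteDeadNode(Node *Root) {
  std::vector<Node *> DeadNodes{Root};
  while (!DeadNodes.empty()) {
    Node *N = DeadNodes.back();
    DeadNodes.pop_back();
    for (Node *Op : N->Operands) {
      if (--Op->NumUses == 0)
        DeadNodes.push_back(Op);  // operand became dead: delete it too
      else
        addToWorklist(Op);        // still live: revisit it for combining
    }
    deleteFromDAG(N);
  }
}
```

In the real combiner a deleted node would also have to be pulled off the
combine worklist; the sketch omits that bookkeeping.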
I think the x86 changes are really minor and uninteresting, but the
avx512 change at least is hiding a "regression" (despite the test case
being just noise, not testing some performance invariant) that might be
worth looking into. I'm not sure whether any of the others impact
specific "important" code paths, but they either didn't look terribly
interesting to me or the changes were really minor. The consensus in
review was to fix any regressions that show up after the fact here.
Thanks to the other reviewers for checking the output on other
architectures. There is a specific regression on ARM for which Tim
already has a fix prepped to commit.
Differential Revision: http://reviews.llvm.org/D4616
llvm-svn: 213727
diff --git a/llvm/test/CodeGen/ARM/fold-stack-adjust.ll b/llvm/test/CodeGen/ARM/fold-stack-adjust.ll
index eb0120f..1d10464 100644
--- a/llvm/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/llvm/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -167,9 +167,9 @@
define void @test_varsize(...) minsize {
; CHECK-T1-LABEL: test_varsize:
; CHECK-T1: sub sp, #16
-; CHECK-T1: push {r2, r3, r4, r5, r7, lr}
+; CHECK-T1: push {r5, r6, r7, lr}
; ...
-; CHECK-T1: pop {r2, r3, r4, r5, r7}
+; CHECK-T1: pop {r2, r3, r7}
; CHECK-T1: pop {r3}
; CHECK-T1: add sp, #16
; CHECK-T1: bx r3
diff --git a/llvm/test/CodeGen/ARM/sxt_rot.ll b/llvm/test/CodeGen/ARM/sxt_rot.ll
index 5ddea2e..4162691 100644
--- a/llvm/test/CodeGen/ARM/sxt_rot.ll
+++ b/llvm/test/CodeGen/ARM/sxt_rot.ll
@@ -9,7 +9,8 @@
define signext i8 @test1(i32 %A) {
; CHECK: test1
-; CHECK: sxtb r0, r0, ror #8
+; CHECK: lsr r0, r0, #8
+; CHECK: sxtb r0, r0
%B = lshr i32 %A, 8
%C = shl i32 %A, 24
%D = or i32 %B, %C
diff --git a/llvm/test/CodeGen/PowerPC/complex-return.ll b/llvm/test/CodeGen/PowerPC/complex-return.ll
index 8a6adae..9d25e61 100644
--- a/llvm/test/CodeGen/PowerPC/complex-return.ll
+++ b/llvm/test/CodeGen/PowerPC/complex-return.ll
@@ -24,10 +24,10 @@
}
; CHECK-LABEL: foo:
-; CHECK: lfd 3
-; CHECK: lfd 4
; CHECK: lfd 1
; CHECK: lfd 2
+; CHECK: lfd 3
+; CHECK: lfd 4
define { float, float } @oof() nounwind {
entry:
diff --git a/llvm/test/CodeGen/PowerPC/subsumes-pred-regs.ll b/llvm/test/CodeGen/PowerPC/subsumes-pred-regs.ll
index da637cd..c510e36 100644
--- a/llvm/test/CodeGen/PowerPC/subsumes-pred-regs.ll
+++ b/llvm/test/CodeGen/PowerPC/subsumes-pred-regs.ll
@@ -35,7 +35,7 @@
br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49
; CHECK: .LBB0_7:
-; CHECK: beq 1, .LBB0_10
+; CHECK: bne 1, .LBB0_10
; CHECK: beq 0, .LBB0_10
; CHECK: .LBB0_9:
diff --git a/llvm/test/CodeGen/R600/r600-export-fix.ll b/llvm/test/CodeGen/R600/r600-export-fix.ll
index 73bc063..7d72856 100644
--- a/llvm/test/CodeGen/R600/r600-export-fix.ll
+++ b/llvm/test/CodeGen/R600/r600-export-fix.ll
@@ -3,9 +3,9 @@
;CHECK: EXPORT T{{[0-9]}}.XYZW
;CHECK: EXPORT T{{[0-9]}}.0000
;CHECK: EXPORT T{{[0-9]}}.0000
-;CHECK: EXPORT T{{[0-9]}}.0XZW
+;CHECK: EXPORT T{{[0-9]}}.0XYZ
;CHECK: EXPORT T{{[0-9]}}.XYZW
-;CHECK: EXPORT T{{[0-9]}}.YX00
+;CHECK: EXPORT T{{[0-9]}}.YZ00
;CHECK: EXPORT T{{[0-9]}}.0000
;CHECK: EXPORT T{{[0-9]}}.0000
diff --git a/llvm/test/CodeGen/R600/swizzle-export.ll b/llvm/test/CodeGen/R600/swizzle-export.ll
index 16c3f19..0a68f76 100644
--- a/llvm/test/CodeGen/R600/swizzle-export.ll
+++ b/llvm/test/CodeGen/R600/swizzle-export.ll
@@ -94,7 +94,7 @@
; EG-CHECK: @main2
; EG-CHECK: T{{[0-9]+}}.XY__
-; EG-CHECK: T{{[0-9]+}}.YXZ0
+; EG-CHECK: T{{[0-9]+}}.ZXY0
define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
main_body:
diff --git a/llvm/test/CodeGen/Thumb2/thumb2-sxt_rot.ll b/llvm/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
index cef3490..5e0977e 100644
--- a/llvm/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
+++ b/llvm/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
@@ -10,7 +10,8 @@
define signext i8 @test1(i32 %A) {
; CHECK: test1
-; CHECK: sxtb.w r0, r0, ror #8
+; CHECK: lsrs r0, r0, #8
+; CHECK: sxtb r0, r0
%B = lshr i32 %A, 8
%C = shl i32 %A, 24
%D = or i32 %B, %C
diff --git a/llvm/test/CodeGen/Thumb2/thumb2-uxt_rot.ll b/llvm/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
index bcd4a0f..06e78d5 100644
--- a/llvm/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
+++ b/llvm/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
@@ -25,7 +25,7 @@
define zeroext i32 @test3(i32 %A.u) {
; A8: test3
-; A8: uxth.w r0, r0, ror #8
+; A8: ubfx r0, r0, #8, #16
%B.u = lshr i32 %A.u, 8
%C.u = shl i32 %A.u, 24
%D.u = or i32 %B.u, %C.u
diff --git a/llvm/test/CodeGen/X86/avx512-zext-load-crash.ll b/llvm/test/CodeGen/X86/avx512-zext-load-crash.ll
deleted file mode 100644
index 07ded13..0000000
--- a/llvm/test/CodeGen/X86/avx512-zext-load-crash.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-
-define <8 x i16> @test_zext_load() {
- ; CHECK: vmovq
-entry:
- %0 = load <2 x i16> ** undef, align 8
- %1 = getelementptr inbounds <2 x i16>* %0, i64 1
- %2 = load <2 x i16>* %0, align 1
- %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %4 = load <2 x i16>* %1, align 1
- %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x i16> %6
-}
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index 2681c10..cc40bcf 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -237,44 +237,6 @@
ret i32 %base
}
-define void @test_loop_rotate_reversed_blocks() {
-; This test case (greatly reduced from an Olden bencmark) ensures that the loop
-; rotate implementation doesn't assume that loops are laid out in a particular
-; order. The first loop will get split into two basic blocks, with the loop
-; header coming after the loop latch.
-;
-; CHECK: test_loop_rotate_reversed_blocks
-; CHECK: %entry
-; Look for a jump into the middle of the loop, and no branches mid-way.
-; CHECK: jmp
-; CHECK: %loop1
-; CHECK-NOT: j{{\w*}} .LBB{{.*}}
-; CHECK: %loop1
-; CHECK: je
-
-entry:
- %cond1 = load volatile i1* undef
- br i1 %cond1, label %loop2.preheader, label %loop1
-
-loop1:
- call i32 @f()
- %cond2 = load volatile i1* undef
- br i1 %cond2, label %loop2.preheader, label %loop1
-
-loop2.preheader:
- call i32 @f()
- %cond3 = load volatile i1* undef
- br i1 %cond3, label %exit, label %loop2
-
-loop2:
- call i32 @f()
- %cond4 = load volatile i1* undef
- br i1 %cond4, label %exit, label %loop2
-
-exit:
- ret void
-}
-
define i32 @test_loop_align(i32 %i, i32* %a) {
; Check that we provide basic loop body alignment with the block placement
; pass.
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 21225e3..fd07a3f 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -31,6 +31,7 @@
; CHECK-LABEL: test3:
; CHECK: movzbl 8(%esp), %eax
; CHECK-NEXT: imull $171, %eax
+; CHECK-NEXT: andl $65024, %eax
; CHECK-NEXT: shrl $9, %eax
; CHECK-NEXT: ret
}
@@ -56,9 +57,10 @@
%div = sdiv i16 %x, 10
ret i16 %div
; CHECK-LABEL: test6:
-; CHECK: imull $26215, %eax, %ecx
-; CHECK: sarl $18, %ecx
-; CHECK: shrl $15, %eax
+; CHECK: imull $26215, %eax
+; CHECK: movl %eax, %ecx
+; CHECK: shrl $31, %ecx
+; CHECK: sarl $18, %eax
}
define i32 @test7(i32 %x) nounwind {
diff --git a/llvm/test/CodeGen/X86/fold-pcmpeqd-0.ll b/llvm/test/CodeGen/X86/fold-pcmpeqd-0.ll
deleted file mode 100644
index 1d315ff..0000000
--- a/llvm/test/CodeGen/X86/fold-pcmpeqd-0.ll
+++ /dev/null
@@ -1,117 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s
-; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s
-
-; i386 test has been disabled when scheduler 2-addr hack is disabled.
-
-; This testcase shouldn't need to spill the -1 value,
-; so it should just use pcmpeqd to materialize an all-ones vector.
-; For i386, cp load of -1 are folded.
-
-; With -regalloc=greedy, the live range is split before spilling, so the first
-; pcmpeq doesn't get folded as a constant pool load.
-
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-
-; X86-64: pcmpeqd
-; X86-64-NOT: pcmpeqd
-
- %struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }>
- %struct._cl_image_format_t = type <{ i32, i32, i32 }>
- %struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }>
-
-define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind {
-entry:
- %tmp3.i = load i32* null ; <i32> [#uses=1]
- %cmp = icmp sgt i32 %tmp3.i, 200 ; <i1> [#uses=1]
- br i1 %cmp, label %forcond, label %ifthen
-
-ifthen: ; preds = %entry
- ret void
-
-forcond: ; preds = %entry
- %tmp3.i536 = load i32* null ; <i32> [#uses=1]
- %cmp12 = icmp slt i32 0, %tmp3.i536 ; <i1> [#uses=1]
- br i1 %cmp12, label %forbody, label %afterfor
-
-forbody: ; preds = %forcond
- %bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]
- %mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer ; <<4 x float>> [#uses=1]
- %mul257 = fmul <4 x float> %mul233, zeroinitializer ; <<4 x float>> [#uses=1]
- %mul275 = fmul <4 x float> %mul257, zeroinitializer ; <<4 x float>> [#uses=1]
- %tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
- %bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]
- %bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]
- %tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind ; <<4 x i32>> [#uses=1]
- %tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind ; <<4 x float>> [#uses=1]
- %sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70 ; <<4 x float>> [#uses=2]
- %mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78 ; <<4 x float>> [#uses=1]
- %add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 > ; <<4 x float>> [#uses=1]
- %mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78 ; <<4 x float>> [#uses=1]
- %add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]
- %bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer ; <<4 x i32>> [#uses=1]
- %bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float> ; <<4 x float>> [#uses=1]
- %mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer ; <<4 x float>> [#uses=1]
- %bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer ; <<4 x i32>> [#uses=1]
- %xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
- %orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102 ; <<4 x i32>> [#uses=1]
- %bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float> ; <<4 x float>> [#uses=1]
- %cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind ; <<4 x float>> [#uses=1]
- %tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
- %sub140.i = fsub <4 x float> zeroinitializer, %tmp80 ; <<4 x float>> [#uses=1]
- %bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 > ; <<4 x i32>> [#uses=0]
- %mul171.i = fmul <4 x float> zeroinitializer, %sub140.i ; <<4 x float>> [#uses=1]
- %add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]
- %bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer ; <<4 x i32>> [#uses=1]
- %bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float> ; <<4 x float>> [#uses=1]
- %mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer ; <<4 x float>> [#uses=1]
- %bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]
- %bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer ; <<4 x i32>> [#uses=1]
- %bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32> ; <<4 x i32>> [#uses=1]
- %xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
- %orps203.i = or <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1]
- %bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float> ; <<4 x float>> [#uses=1]
- %mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer ; <<4 x float>> [#uses=1]
- %mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2]
- %mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1]
- %tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
- %bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer ; <<4 x i32>> [#uses=1]
- %orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer ; <<4 x i32>> [#uses=1]
- %bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1]
- %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
- %bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=2]
- %andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4 ; <<4 x i32>> [#uses=1]
- %bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
- %andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7 ; <<4 x i32>> [#uses=1]
- %orps.i9 = or <4 x i32> %andnps.i8, %andps.i5 ; <<4 x i32>> [#uses=1]
- %bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float> ; <<4 x float>> [#uses=1]
- %bitcast.i = bitcast <4 x float> %mul313 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andps.i = and <4 x i32> %bitcast.i, zeroinitializer ; <<4 x i32>> [#uses=1]
- %orps.i = or <4 x i32> zeroinitializer, %andps.i ; <<4 x i32>> [#uses=1]
- %bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float> ; <<4 x float>> [#uses=1]
- call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
- unreachable
-
-afterfor: ; preds = %forcond
- ret void
-}
-
-declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
-
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/narrow-shl-load.ll b/llvm/test/CodeGen/X86/narrow-shl-load.ll
index 30387925..5175bfc 100644
--- a/llvm/test/CodeGen/X86/narrow-shl-load.ll
+++ b/llvm/test/CodeGen/X86/narrow-shl-load.ll
@@ -30,40 +30,6 @@
ret void
}
-
-; DAGCombiner shouldn't fold the sdiv (ashr) away.
-; rdar://8636812
-; CHECK-LABEL: test2:
-; CHECK: sarl
-
-define i32 @test2() nounwind {
-entry:
- %i = alloca i32, align 4
- %j = alloca i8, align 1
- store i32 127, i32* %i, align 4
- store i8 0, i8* %j, align 1
- %tmp3 = load i32* %i, align 4
- %mul = mul nsw i32 %tmp3, 2
- %conv4 = trunc i32 %mul to i8
- %conv5 = sext i8 %conv4 to i32
- %div6 = sdiv i32 %conv5, 2
- %conv7 = trunc i32 %div6 to i8
- %conv9 = sext i8 %conv7 to i32
- %cmp = icmp eq i32 %conv9, -1
- br i1 %cmp, label %if.then, label %if.end
-
-if.then: ; preds = %entry
- ret i32 0
-
-if.end: ; preds = %entry
- call void @abort() noreturn
- unreachable
-}
-
-declare void @abort() noreturn
-
-declare void @exit(i32) noreturn
-
; DAG Combiner can't fold this into a load of the 1'th byte.
; PR8757
define i32 @test3(i32 *%P) nounwind ssp {
diff --git a/llvm/test/CodeGen/X86/store-narrow.ll b/llvm/test/CodeGen/X86/store-narrow.ll
index 7557f25..51f6fb0 100644
--- a/llvm/test/CodeGen/X86/store-narrow.ll
+++ b/llvm/test/CodeGen/X86/store-narrow.ll
@@ -34,8 +34,8 @@
; X64: movb %sil, 1(%rdi)
; X32-LABEL: test2:
-; X32: movb 8(%esp), %[[REG:[abcd]l]]
-; X32: movb %[[REG]], 1(%{{.*}})
+; X32: movzbl 8(%esp), %e[[REG:[abcd]]]x
+; X32: movb %[[REG]]l, 1(%{{.*}})
}
define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
diff --git a/llvm/test/CodeGen/X86/vec_extract-sse4.ll b/llvm/test/CodeGen/X86/vec_extract-sse4.ll
index 747c8a8..51f7a98 100644
--- a/llvm/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-sse4.ll
@@ -4,8 +4,8 @@
; CHECK-LABEL: @t1
; CHECK: movl 4(%esp), %[[R0:e[abcd]x]]
; CHECK-NEXT: movl 8(%esp), %[[R1:e[abcd]x]]
-; CHECK-NEXT: movl 12(%[[R1]]), %[[R2:e[abcd]x]]
-; CHECK-NEXT: movl %[[R2]], (%[[R0]])
+; CHECK-NEXT: movss 12(%[[R1]]), %[[R2:xmm.*]]
+; CHECK-NEXT: movss %[[R2]], (%[[R0]])
; CHECK-NEXT: retl
%X = load <4 x float>* %P1