PR14972: SROA vs. GVN exposed a really bad bug in SROA.

The fundamental problem is that SROA didn't allow for overly wide loads
where the bits past the end of the alloca were masked away and the load
was sufficiently aligned to ensure there is no risk of page fault, or
other trapping behavior. With such widened loads, SROA would delete the
load entirely rather than clamping it to the size of the alloca in order
to allow mem2reg to fire. This was exposed by a test case that neatly
arranged for GVN to run first, widening certain loads, followed by an
inline step, and then SROA which miscompiles the code. However, I see no
reason why this hasn't been plaguing us in other contexts. It seems
deeply broken.

Diagnosing all of the above took all of 10 minutes of debugging. The
really annoying aspect is that fixing this completely breaks the pass.
;] There was an implicit reliance on the fact that no loads or stores
extended past the alloca once we decided to rewrite them in the final
stage of SROA. This was used to encode information about whether the
loads and stores had been split across multiple partitions of the
original alloca. That required threading explicit tracking of whether
a *use* of a partition is split across multiple partitions.

Once that was done, another problem arose: we allowed splitting of
integer loads and stores iff they were loads and stores to the entire
alloca. This is a really arbitrary limitation, and splitting at least
some integer loads and stores is crucial to maximize promotion
opportunities. My first attempt was to start removing the restriction
entirely, but currently that does Very Bad Things by causing *many*
common alloca patterns to be fully decomposed into i8 operations and
lots of or-ing together to produce larger integers on demand. The code
bloat is terrifying. That is still the right end-goal, but substantial
work must be done to either merge partitions or ensure that small i8
values are eagerly merged in some other pass. Sadly, figuring all this
out took essentially all the time and effort here.

So the end result is that we allow splitting only when the load or store
at least covers the alloca. That ensures widened loads and stores don't
hurt SROA, and that we don't rampantly decompose operations more than we
have previously.

All of this was already fairly well tested, and so I've just updated the
tests to cover the wide load behavior. I can add a test that crafts the
pass ordering magic which caused the original PR, but that seems really
brittle and to provide little benefit. The fundamental problem is that
widened loads should Just Work.

llvm-svn: 177055
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index efc01ac..30dd217 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -500,14 +500,27 @@
 
 define i64 @test9() {
 ; Ensure we can handle loads off the end of an alloca even when wrapped in
-; weird bit casts and types. The result is undef, but this shouldn't crash
-; anything.
+; weird bit casts and types. This is valid IR due to the alignment and masking
+; off the bits past the end of the alloca.
+;
 ; CHECK: @test9
 ; CHECK-NOT: alloca
-; CHECK: ret i64 undef
+; CHECK:      %[[b2:.*]] = zext i8 26 to i64
+; CHECK-NEXT: %[[s2:.*]] = shl i64 %[[b2]], 16
+; CHECK-NEXT: %[[m2:.*]] = and i64 undef, -16711681
+; CHECK-NEXT: %[[i2:.*]] = or i64 %[[m2]], %[[s2]]
+; CHECK-NEXT: %[[b1:.*]] = zext i8 0 to i64
+; CHECK-NEXT: %[[s1:.*]] = shl i64 %[[b1]], 8
+; CHECK-NEXT: %[[m1:.*]] = and i64 %[[i2]], -65281
+; CHECK-NEXT: %[[i1:.*]] = or i64 %[[m1]], %[[s1]]
+; CHECK-NEXT: %[[b0:.*]] = zext i8 0 to i64
+; CHECK-NEXT: %[[m0:.*]] = and i64 %[[i1]], -256
+; CHECK-NEXT: %[[i0:.*]] = or i64 %[[m0]], %[[b0]]
+; CHECK-NEXT: %[[result:.*]] = and i64 %[[i0]], 16777215
+; CHECK-NEXT: ret i64 %[[result]]
 
 entry:
-  %a = alloca { [3 x i8] }
+  %a = alloca { [3 x i8] }, align 8
   %gep1 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 0
   store i8 0, i8* %gep1, align 1
   %gep2 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 1
@@ -516,7 +529,8 @@
   store i8 26, i8* %gep3, align 1
   %cast = bitcast { [3 x i8] }* %a to { i64 }*
   %elt = getelementptr inbounds { i64 }* %cast, i32 0, i32 0
-  %result = load i64* %elt
+  %load = load i64* %elt
+  %result = and i64 %load, 16777215
   ret i64 %result
 }
 
@@ -617,11 +631,12 @@
 ; Ensure we don't crash and handle undefined loads that straddle the end of the
 ; allocation.
 ; CHECK: @test13
-; CHECK: %[[ret:.*]] = zext i16 undef to i32
-; CHECK: ret i32 %[[ret]]
+; CHECK:      %[[value:.*]] = zext i8 0 to i16
+; CHECK-NEXT: %[[ret:.*]] = zext i16 %[[value]] to i32
+; CHECK-NEXT: ret i32 %[[ret]]
 
 entry:
-  %a = alloca [3 x i8]
+  %a = alloca [3 x i8], align 2
   %b0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
   store i8 0, i8* %b0ptr
   %b1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
@@ -1160,20 +1175,25 @@
 entry:
   %a = alloca <{ i1 }>, align 8
   %b = alloca <{ i1 }>, align 8
-; Nothing of interest is simplified here.
-; CHECK: alloca
-; CHECK: alloca
+; CHECK:      %[[a:.*]] = alloca i8, align 8
 
   %b.i1 = bitcast <{ i1 }>* %b to i1*
   store i1 %x, i1* %b.i1, align 8
   %b.i8 = bitcast <{ i1 }>* %b to i8*
   %foo = load i8* %b.i8, align 1
+; CHECK-NEXT: {{.*}} = zext i1 %x to i8
+; CHECK-NEXT: %[[ext:.*]] = zext i1 %x to i8
+; CHECK-NEXT: store i8 %[[ext]], i8* %[[a]], align 8
+; CHECK-NEXT: {{.*}} = load i8* %[[a]], align 8
 
   %a.i8 = bitcast <{ i1 }>* %a to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.i8, i8* %b.i8, i32 1, i32 1, i1 false) nounwind
   %bar = load i8* %a.i8, align 1
   %a.i1 = getelementptr inbounds <{ i1 }>* %a, i32 0, i32 0
   %baz = load i1* %a.i1, align 1
+; CHECK-NEXT: %[[a_cast:.*]] = bitcast i8* %[[a]] to i1*
+; CHECK-NEXT: {{.*}} = load i1* %[[a_cast]], align 8
+
   ret void
 }
 
diff --git a/llvm/test/Transforms/SROA/phi-and-select.ll b/llvm/test/Transforms/SROA/phi-and-select.ll
index 921016a..b993180 100644
--- a/llvm/test/Transforms/SROA/phi-and-select.ll
+++ b/llvm/test/Transforms/SROA/phi-and-select.ll
@@ -396,9 +396,10 @@
 ; Here we form a PHI-node by promoting the pointer alloca first, and then in
 ; order to promote the other two allocas, we speculate the load of the
 ; now-phi-node-pointer. In doing so we end up loading a 64-bit value from an i8
-; alloca, which is completely bogus. However, we were asserting on trying to
-; rewrite it. Now it is replaced with undef. Eventually we may replace it with
-; unrechable and even the CFG will go away here.
+; alloca. While this is a bit dubious, we were asserting on trying to
+; rewrite it. The trick is that the code using the value may carefully take
+; steps to only use the not-undef bits, and so we need to at least loosely
+; support this..
 entry:
   %a = alloca i64
   %b = alloca i8
@@ -414,13 +415,14 @@
 if.then:
   store i8* %b, i8** %ptr.cast
   br label %if.end
+; CHECK-NOT: store
+; CHECK: %[[ext:.*]] = zext i8 1 to i64
 
 if.end:
   %tmp = load i64** %ptr
   %result = load i64* %tmp
-; CHECK-NOT: store
 ; CHECK-NOT: load
-; CHECK: %[[result:.*]] = phi i64 [ undef, %if.then ], [ 0, %entry ]
+; CHECK: %[[result:.*]] = phi i64 [ %[[ext]], %if.then ], [ 0, %entry ]
 
   ret i64 %result
 ; CHECK-NEXT: ret i64 %[[result]]