; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
;
; <rdar://problem/14477220>

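; A complex value: two adjacent floats, 8 bytes total.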
%class.Complex = type { float, float }


; Check that independent slices lead to independent loads, and that the slices
; then feed into different register files.
;
; The layout is:
; LSB 0 1 2 3 | 4 5 6 7 MSB
;       Low      High
; The base address points to 0 and is 8-byte aligned.
; The low slice starts at 0 (base) and is 8-byte aligned.
; The high slice starts at 4 (base + 4 bytes) and is 4-byte aligned.
;
; STRESS-LABEL: t1:
; Load out[out_start + 8].real, i.e., base + 8 * 8 + 0.
; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, i.e., base + 0.
; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, i.e., base + 8 * 8 + 4.
; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, i.e., base + 4.
; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Swap Imm and Real.
; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
;
; Same for REGULAR: we eliminate a register bank copy with each slice.
; REGULAR-LABEL: t1:
; Load out[out_start + 8].real, i.e., base + 8 * 8 + 0.
; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, i.e., base + 0.
; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, i.e., base + 8 * 8 + 4.
; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, i.e., base + 4.
; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Swap Imm and Real.
; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
entry:
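; Load the 64-bit chunk at out[out_start] and split it into two f32 slices:
; real in the low 32 bits, imaginary in the high 32 bits.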
%arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
%tmp = bitcast %class.Complex* %arrayidx to i64*
%tmp1 = load i64* %tmp, align 8
%t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
%tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
%t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
%t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
%tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
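; Load out[out_start + 8] field by field and add each field to the matching slice.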
%add = add i64 %out_start, 8
%arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
%i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
%tmp4 = load float* %i.i, align 4
%add.i = fadd float %tmp4, %tmp2
%retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
%r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
%tmp5 = load float* %r.i, align 4
%add5.i = fadd float %tmp5, %tmp3
%retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
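; Store both sums back into out[out_start] as a single <2 x float>.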
%ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
ret void
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)

; Function Attrs: nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture)

; Check that we do not read outside the chunk of bits of the original load.
;
; The 64-bit load should have been split into one 32-bit slice and one 16-bit slice.
; The 16-bit slice should be zero-extended to match the final type.
;
; The memory layout is:
; LSB 0 1 2 3 | 4 5 | 6 7 MSB
;      Low            High
; The base address points to 0 and is 8-byte aligned.
; The low slice starts at 0 (base) and is 8-byte aligned.
; The high slice starts at 6 (base + 6 bytes) and is 2-byte aligned.
;
; STRESS-LABEL: t2:
; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
; STRESS-NEXT: addl ([[BASE]]), %eax
; STRESS-NEXT: ret
;
; For the REGULAR heuristic, it is not profitable to slice loads whose parts
; are not adjacent in memory. Here there is a hole covering bytes #4-5.
; REGULAR-LABEL: t2:
; REGULAR: shrq $48
define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
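; Low slice: the low 32 bits; high slice: bits 48-63 (bytes #6-7).
; Bytes #4-5 are never read.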
%arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
%bitcast = bitcast %class.Complex* %arrayidx to i64*
%chunk64 = load i64* %bitcast, align 8
%slice32_low = trunc i64 %chunk64 to i32
%shift48 = lshr i64 %chunk64, 48
%slice32_high = trunc i64 %shift48 to i32
%res = add i32 %slice32_high, %slice32_low
ret i32 %res
}

; Check that we do not optimize overlapping slices.
;
; The 64-bit load should NOT have been split, because the slices overlap.
; The first slice uses bytes numbered 0 to 3.
; The second slice uses bytes numbered 6 and 7.
; The third slice uses bytes numbered 4 to 7.
;
; STRESS-LABEL: t3:
; STRESS: shrq $48
; STRESS: shrq $32
;
; REGULAR-LABEL: t3:
; REGULAR: shrq $48
; REGULAR: shrq $32
define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
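; Three slices: bytes #0-3 (low), #6-7 (high), and #4-7 (lowhigh); the last
; two overlap, so no slicing happens.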
%arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
%bitcast = bitcast %class.Complex* %arrayidx to i64*
%chunk64 = load i64* %bitcast, align 8
%slice32_low = trunc i64 %chunk64 to i32
%shift48 = lshr i64 %chunk64, 48
%slice32_high = trunc i64 %shift48 to i32
%shift32 = lshr i64 %chunk64, 32
%slice32_lowhigh = trunc i64 %shift32 to i32
%tmpres = add i32 %slice32_high, %slice32_low
%res = add i32 %slice32_lowhigh, %tmpres
ret i32 %res
}