; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
;
; <rdar://problem/14477220>

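; A complex value: two adjacent floats, 8 bytes total.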
%class.Complex = type { float, float }


; Check that independent slices lead to independent loads, and that the slices
; then feed into different register files.
;
; The layout is:
; LSB 0 1 2 3 | 4 5 6 7 MSB
;       Low      High
; The base address points to 0 and is 8-byte aligned.
; The low slice starts at 0 (base) and is 8-byte aligned.
; The high slice starts at 4 (base + 4 bytes) and is 4-byte aligned.
;
; STRESS-LABEL: t1:
; Load out[out_start + 8].real, i.e., base + 8 * 8 + 0.
; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, i.e., base + 0.
; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, i.e., base + 8 * 8 + 4.
; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, i.e., base + 4.
; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Swap Imm and Real.
; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
;
; Same for REGULAR: we eliminate a register bank copy with each slice.
; REGULAR-LABEL: t1:
; Load out[out_start + 8].real, i.e., base + 8 * 8 + 0.
; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, i.e., base + 0.
; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, i.e., base + 8 * 8 + 4.
; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, i.e., base + 4.
; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Swap Imm and Real.
; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
entry:
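; Load the 64-bit chunk at out[out_start] and split it into two f32 slices:
; real in the low 32 bits, imaginary in the high 32 bits.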
%arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
%tmp = bitcast %class.Complex* %arrayidx to i64*
%tmp1 = load i64* %tmp, align 8
%t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
%tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
%t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
%t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
%tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
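; Load out[out_start + 8] field by field and add each field to the matching slice.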
%add = add i64 %out_start, 8
%arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
%i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
%tmp4 = load float* %i.i, align 4
%add.i = fadd float %tmp4, %tmp2
%retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
%r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
%tmp5 = load float* %r.i, align 4
%add5.i = fadd float %tmp5, %tmp3
%retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
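; Store both sums back into out[out_start] as a single <2 x float>.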
%ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
ret void
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)

; Function Attrs: nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture)

; Check that we do not read outside the chunk of bits of the original load.
;
; The 64-bit load should have been split into one 32-bit slice and one 16-bit slice.
; The 16-bit slice should be zero-extended to match the final type.
;
; The memory layout is:
; LSB 0 1 2 3 | 4 5 | 6 7 MSB
;      Low            High
; The base address points to 0 and is 8-byte aligned.
; The low slice starts at 0 (base) and is 8-byte aligned.
; The high slice starts at 6 (base + 6 bytes) and is 2-byte aligned.
;
; STRESS-LABEL: t2:
; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
; STRESS-NEXT: addl ([[BASE]]), %eax
; STRESS-NEXT: ret
;
; For the REGULAR heuristic, it is not profitable to slice loads whose parts
; are not adjacent in memory. Here there is a hole covering bytes #4-5.
; REGULAR-LABEL: t2:
; REGULAR: shrq $48
define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
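; Low slice: the low 32 bits; high slice: bits 48-63 (bytes #6-7).
; Bytes #4-5 are never read.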
%arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
%bitcast = bitcast %class.Complex* %arrayidx to i64*
%chunk64 = load i64* %bitcast, align 8
%slice32_low = trunc i64 %chunk64 to i32
%shift48 = lshr i64 %chunk64, 48
%slice32_high = trunc i64 %shift48 to i32
%res = add i32 %slice32_high, %slice32_low
ret i32 %res
}

; Check that we do not optimize overlapping slices.
;
; The 64-bit load should NOT have been split, because the slices overlap.
; The first slice uses bytes numbered 0 to 3.
; The second slice uses bytes numbered 6 and 7.
; The third slice uses bytes numbered 4 to 7.
;
; STRESS-LABEL: t3:
; STRESS: shrq $48
; STRESS: shrq $32
;
; REGULAR-LABEL: t3:
; REGULAR: shrq $48
; REGULAR: shrq $32
define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
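; Three slices: bytes #0-3 (low), #6-7 (high), and #4-7 (lowhigh); the last
; two overlap, so no slicing happens.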
%arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
%bitcast = bitcast %class.Complex* %arrayidx to i64*
%chunk64 = load i64* %bitcast, align 8
%slice32_low = trunc i64 %chunk64 to i32
%shift48 = lshr i64 %chunk64, 48
%slice32_high = trunc i64 %shift48 to i32
%shift32 = lshr i64 %chunk64, 32
%slice32_lowhigh = trunc i64 %shift32 to i32
%tmpres = add i32 %slice32_high, %slice32_low
%res = add i32 %slice32_lowhigh, %tmpres
ret i32 %res
}