Blame - llvm/test/CodeGen/R600/ds_write2.ll - toolchain/llvm-project

blob: 3a3c8368682e1d63e7d17a9684654ce8a09be166 [file] [log] [blame]

Matt Arsenault	4103328	2014-10-10 22:01:59 +0000	[diff] [blame^]	1	; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
				2
				3	@lds = addrspace(3) global [512 x float] zeroinitializer, align 4 ; 512-float array in addrspace(3); store target of the f32 tests that index a global
				4	@lds.f64 = addrspace(3) global [512 x double] zeroinitializer, align 8 ; 512-double array in addrspace(3); store target of @simple_write2_one_val_f64 and @simple_write2_two_val_f64
				5
				6
				7	; SI-LABEL: @simple_write2_one_val_f32
				8	; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]]
				9	; SI-DAG: V_LSHLREV_B32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				10	; SI: DS_WRITE2_B32 [[VPTR]], [[VAL]], [[VAL]], 0x0, 0x8 [M0]
				11	; SI: S_ENDPGM
				12	define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { ; one loaded float is stored to two @lds elements that are 8 apart; %C is unused
				13	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				14	%src = getelementptr float addrspace(1)* %in, i32 %tid
				15	%data = load float addrspace(1)* %src, align 4
				16	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid
				17	store float %data, float addrspace(3)* %slot0, align 4
				18	%tid.far = add nsw i32 %tid, 8 ; element index of the second store
				19	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid.far
				20	store float %data, float addrspace(3)* %slot1, align 4 ; same value as the first store
				21	ret void
				22	}
				23
				24	; SI-LABEL: @simple_write2_two_val_f32
				25	; SI-DAG: BUFFER_LOAD_DWORD [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
				26	; SI-DAG: BUFFER_LOAD_DWORD [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
				27	; SI-DAG: V_LSHLREV_B32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				28	; SI: DS_WRITE2_B32 [[VPTR]], [[VAL0]], [[VAL1]], 0x0, 0x8 [M0]
				29	; SI: S_ENDPGM
				30	define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { ; two consecutive floats are loaded and stored to @lds elements that are 8 apart; %C is unused
				31	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				32	%src.0 = getelementptr float addrspace(1)* %in, i32 %tid
				33	%src.1 = getelementptr float addrspace(1)* %src.0, i32 1 ; the float immediately after %src.0
				34	%data0 = load float addrspace(1)* %src.0, align 4
				35	%data1 = load float addrspace(1)* %src.1, align 4
				36	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid
				37	store float %data0, float addrspace(3)* %slot0, align 4
				38	%tid.far = add nsw i32 %tid, 8 ; element index of the second store
				39	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid.far
				40	store float %data1, float addrspace(3)* %slot1, align 4
				41	ret void
				42	}
				43
				44	; SI-LABEL: @simple_write2_two_val_f32_volatile_0
				45	; SI-NOT: DS_WRITE2_B32
				46	; SI: DS_WRITE_B32 {{v[0-9]+}}, {{v[0-9]+}}, 0x0
				47	; SI: DS_WRITE_B32 {{v[0-9]+}}, {{v[0-9]+}}, 0x20
				48	; SI: S_ENDPGM
				49	define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; same store pattern as the two-value test, but the first store is volatile; %C is unused
				50	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				51	%src0 = getelementptr float addrspace(1)* %in0, i32 %tid
				52	%src1 = getelementptr float addrspace(1)* %in1, i32 %tid
				53	%data0 = load float addrspace(1)* %src0, align 4
				54	%data1 = load float addrspace(1)* %src1, align 4
				55	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid
				56	store volatile float %data0, float addrspace(3)* %slot0, align 4 ; volatile: the checks expect two separate DS_WRITE_B32
				57	%tid.far = add nsw i32 %tid, 8
				58	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid.far
				59	store float %data1, float addrspace(3)* %slot1, align 4
				60	ret void
				61	}
				62
				63	; SI-LABEL: @simple_write2_two_val_f32_volatile_1
				64	; SI-NOT: DS_WRITE2_B32
				65	; SI: DS_WRITE_B32 {{v[0-9]+}}, {{v[0-9]+}}, 0x0
				66	; SI: DS_WRITE_B32 {{v[0-9]+}}, {{v[0-9]+}}, 0x20
				67	; SI: S_ENDPGM
				68	define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; same store pattern as the two-value test, but the second store is volatile; %C is unused
				69	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				70	%src0 = getelementptr float addrspace(1)* %in0, i32 %tid
				71	%src1 = getelementptr float addrspace(1)* %in1, i32 %tid
				72	%data0 = load float addrspace(1)* %src0, align 4
				73	%data1 = load float addrspace(1)* %src1, align 4
				74	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid
				75	store float %data0, float addrspace(3)* %slot0, align 4
				76	%tid.far = add nsw i32 %tid, 8
				77	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid.far
				78	store volatile float %data1, float addrspace(3)* %slot1, align 4 ; volatile: the checks expect two separate DS_WRITE_B32
				79	ret void
				80	}
				81
				82	; 2 data subregisters from different super registers.
				83	; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32
				84	; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
				85	; SI: BUFFER_LOAD_DWORDX2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
				86	; SI: V_LSHLREV_B32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				87	; SI: DS_WRITE2_B32 [[VPTR]], v[[VAL0]], v[[VAL1]], 0x0, 0x8 [M0]
				88	; SI: S_ENDPGM
				89	define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { ; stores element 0 of one loaded <2 x float> and element 1 of the next one; %C is unused
				90	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				91	%src.0 = getelementptr <2 x float> addrspace(1)* %in, i32 %tid
				92	%src.1 = getelementptr <2 x float> addrspace(1)* %src.0, i32 1 ; the vector immediately after %src.0
				93	%vec0 = load <2 x float> addrspace(1)* %src.0, align 8
				94	%vec1 = load <2 x float> addrspace(1)* %src.1, align 8
				95	%vec0.lo = extractelement <2 x float> %vec0, i32 0
				96	%vec1.hi = extractelement <2 x float> %vec1, i32 1
				97	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid
				98	store float %vec0.lo, float addrspace(3)* %slot0, align 4
				99	%tid.far = add nsw i32 %tid, 8
				100	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid.far
				101	store float %vec1.hi, float addrspace(3)* %slot1, align 4
				102	ret void
				103	}
				104
				105	; SI-LABEL: @simple_write2_two_val_subreg2_f32
				106	; SI-DAG: BUFFER_LOAD_DWORDX2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
				107	; SI-DAG: V_LSHLREV_B32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				108	; SI: DS_WRITE2_B32 [[VPTR]], v[[VAL0]], v[[VAL1]], 0x0, 0x8 [M0]
				109	; SI: S_ENDPGM
				110	define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { ; both stored values are elements of a single loaded <2 x float>; %C is unused
				111	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				112	%src = getelementptr <2 x float> addrspace(1)* %in, i32 %tid
				113	%vec = load <2 x float> addrspace(1)* %src, align 8
				114	%elt0 = extractelement <2 x float> %vec, i32 0
				115	%elt1 = extractelement <2 x float> %vec, i32 1
				116	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid
				117	store float %elt0, float addrspace(3)* %slot0, align 4
				118	%tid.far = add nsw i32 %tid, 8
				119	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid.far
				120	store float %elt1, float addrspace(3)* %slot1, align 4
				121	ret void
				122	}
				123
				124	; SI-LABEL: @simple_write2_two_val_subreg4_f32
				125	; SI-DAG: BUFFER_LOAD_DWORDX4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
				126	; SI-DAG: V_LSHLREV_B32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				127	; SI: DS_WRITE2_B32 [[VPTR]], v[[VAL0]], v[[VAL1]], 0x0, 0x8 [M0]
				128	; SI: S_ENDPGM
				129	define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { ; stored values are elements 0 and 3 of a single loaded <4 x float>; %C is unused
				130	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				131	%src = getelementptr <4 x float> addrspace(1)* %in, i32 %tid
				132	%vec = load <4 x float> addrspace(1)* %src, align 16
				133	%elt0 = extractelement <4 x float> %vec, i32 0
				134	%elt3 = extractelement <4 x float> %vec, i32 3 ; last element of the vector
				135	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid
				136	store float %elt0, float addrspace(3)* %slot0, align 4
				137	%tid.far = add nsw i32 %tid, 8
				138	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid.far
				139	store float %elt3, float addrspace(3)* %slot1, align 4
				140	ret void
				141	}
				142
				143	; SI-LABEL: @simple_write2_two_val_max_offset_f32
				144	; SI-DAG: BUFFER_LOAD_DWORD [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
				145	; SI-DAG: BUFFER_LOAD_DWORD [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
				146	; SI-DAG: V_LSHLREV_B32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				147	; SI: DS_WRITE2_B32 [[VPTR]], [[VAL0]], [[VAL1]], 0x0, 0xff [M0]
				148	; SI: S_ENDPGM
				149	define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { ; second store is 255 elements past the first; the checks expect it as offset 0xff of one DS_WRITE2_B32; %C is unused
				150	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				151	%src.0 = getelementptr float addrspace(1)* %in, i32 %tid
				152	%src.1 = getelementptr float addrspace(1)* %src.0, i32 1 ; the float immediately after %src.0
				153	%data0 = load float addrspace(1)* %src.0, align 4
				154	%data1 = load float addrspace(1)* %src.1, align 4
				155	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid
				156	store float %data0, float addrspace(3)* %slot0, align 4
				157	%tid.far = add nsw i32 %tid, 255 ; 255 == 0xff
				158	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid.far
				159	store float %data1, float addrspace(3)* %slot1, align 4
				160	ret void
				161	}
				162
				163	; SI-LABEL: @simple_write2_two_val_too_far_f32
				164	; SI: DS_WRITE_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0x0
				165	; SI: DS_WRITE_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0x400
				166	; SI: S_ENDPGM
				167	define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; second store is 256 elements past the first; the checks expect two DS_WRITE_B32 (offsets 0x0 and 0x400); %C is unused
				168	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				169	%src0 = getelementptr float addrspace(1)* %in0, i32 %tid
				170	%src1 = getelementptr float addrspace(1)* %in1, i32 %tid
				171	%data0 = load float addrspace(1)* %src0, align 4
				172	%data1 = load float addrspace(1)* %src1, align 4
				173	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid
				174	store float %data0, float addrspace(3)* %slot0, align 4
				175	%tid.far = add nsw i32 %tid, 256 ; 256 floats * 4 bytes == 0x400
				176	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %tid.far
				177	store float %data1, float addrspace(3)* %slot1, align 4
				178	ret void
				179	}
				180
				181	; SI-LABEL: @simple_write2_two_val_f32_x2
				182	; SI: DS_WRITE2_B32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]], 0x0, 0x8
				183	; SI-NEXT: DS_WRITE2_B32 [[BASEADDR]], [[VAL0]], [[VAL1]], 0xb, 0x1b
				184	; SI: S_ENDPGM
				185	define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; four stores at element offsets 0, 8, 11, 27; the checks expect two DS_WRITE2_B32 sharing one base; %C is unused
				186	%lane = tail call i32 @llvm.r600.read.tidig.x() #1
				187	%src0 = getelementptr float addrspace(1)* %in0, i32 %lane
				188	%src1 = getelementptr float addrspace(1)* %in1, i32 %lane
				189	%data0 = load float addrspace(1)* %src0, align 4
				190	%data1 = load float addrspace(1)* %src1, align 4
				191
				192	%elt.0 = add nsw i32 %lane, 0
				193	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %elt.0
				194	store float %data0, float addrspace(3)* %slot0, align 4
				195
				196	%elt.1 = add nsw i32 %lane, 8
				197	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %elt.1
				198	store float %data1, float addrspace(3)* %slot1, align 4
				199
				200	%elt.2 = add nsw i32 %lane, 11 ; 11 == 0xb
				201	%slot2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %elt.2
				202	store float %data0, float addrspace(3)* %slot2, align 4
				203
				204	%elt.3 = add nsw i32 %lane, 27 ; 27 == 0x1b
				205	%slot3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %elt.3
				206	store float %data1, float addrspace(3)* %slot3, align 4
				207
				208	ret void
				209	}
				210
				211	; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
				212	; SI: DS_WRITE2_B32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]], 0x3, 0x8
				213	; SI-NEXT: DS_WRITE2_B32 [[BASEADDR]], [[VAL0]], [[VAL1]], 0xb, 0x1b
				214	; SI: S_ENDPGM
				215	define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { ; like the x2 test, but the first store starts at element offset 3 instead of 0; %C is unused
				216	%lane = tail call i32 @llvm.r600.read.tidig.x() #1
				217	%src0 = getelementptr float addrspace(1)* %in0, i32 %lane
				218	%src1 = getelementptr float addrspace(1)* %in1, i32 %lane
				219	%data0 = load float addrspace(1)* %src0, align 4
				220	%data1 = load float addrspace(1)* %src1, align 4
				221
				222	%elt.0 = add nsw i32 %lane, 3 ; non-zero first offset
				223	%slot0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %elt.0
				224	store float %data0, float addrspace(3)* %slot0, align 4
				225
				226	%elt.1 = add nsw i32 %lane, 8
				227	%slot1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %elt.1
				228	store float %data1, float addrspace(3)* %slot1, align 4
				229
				230	%elt.2 = add nsw i32 %lane, 11 ; 11 == 0xb
				231	%slot2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %elt.2
				232	store float %data0, float addrspace(3)* %slot2, align 4
				233
				234	%elt.3 = add nsw i32 %lane, 27 ; 27 == 0x1b
				235	%slot3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %elt.3
				236	store float %data1, float addrspace(3)* %slot3, align 4
				237
				238	ret void
				239	}
				240
				241	; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32
				242	; SI-NOT: DS_WRITE2_B32
				243	; SI: DS_WRITE_B32
				244	; SI: DS_WRITE_B32
				245	; SI: S_ENDPGM
				246	define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { ; the two store addresses come from different lanes of the pointer-vector argument %lds.ptr; %C is unused
				247	%x.i = tail call i32 @llvm.r600.read.tidig.x() #1
				248	%in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
				249	%in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
				250	%val0 = load float addrspace(1)* %in0.gep, align 4
				251	%val1 = load float addrspace(1)* %in1.gep, align 4
				252
				253	%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 ; lane 0 index = %x.i
				254	%index.1 = insertelement <2 x i32> %index.0, i32 8, i32 1 ; lane 1 index = 8 (was written to lane 0, which discarded %x.i and left lane 1 undef)
				255	%gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
				256	%gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
				257	%gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
				258
				259	; Apply an additional offset after the vector that will be more obviously folded.
				260	%gep.1.offset = getelementptr float addrspace(3)* %gep.1, i32 8
				261	store float %val0, float addrspace(3)* %gep.0, align 4
				262
				264	store float %val1, float addrspace(3)* %gep.1.offset, align 4 ; the checks expect no DS_WRITE2_B32, only two DS_WRITE_B32
				265	ret void
				266	}
				267
				268	; SI-LABEL: @simple_write2_one_val_f64
				269	; SI: BUFFER_LOAD_DWORDX2 [[VAL:v\[[0-9]+:[0-9]+\]]],
				270	; SI: V_LSHLREV_B32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
				271	; SI: DS_WRITE2_B64 [[VPTR]], [[VAL]], [[VAL]], 0x0, 0x8 [M0]
				272	; SI: S_ENDPGM
				273	define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { ; one loaded double is stored to two @lds.f64 elements that are 8 apart; %C is unused
				274	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				275	%src = getelementptr double addrspace(1)* %in, i32 %tid
				276	%data = load double addrspace(1)* %src, align 8
				277	%slot0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %tid
				278	store double %data, double addrspace(3)* %slot0, align 8
				279	%tid.far = add nsw i32 %tid, 8 ; element index of the second store
				280	%slot1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %tid.far
				281	store double %data, double addrspace(3)* %slot1, align 8 ; same value as the first store
				282	ret void
				283	}
				284
				285	; SI-LABEL: @misaligned_simple_write2_one_val_f64
				286	; SI-DAG: BUFFER_LOAD_DWORDX2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
				287	; SI-DAG: V_LSHLREV_B32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
				288	; SI: DS_WRITE2_B32 [[VPTR]], v[[VAL0]], v[[VAL1]], 0x0, 0x1 [M0]
				289	; SI: DS_WRITE2_B32 [[VPTR]], v[[VAL0]], v[[VAL1]], 0xe, 0xf [M0]
				290	; SI: S_ENDPGM
				291	define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { ; double stores with only 4-byte alignment through the %lds argument; the checks expect each as a DS_WRITE2_B32 pair; %C is unused
				292	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				293	%src = getelementptr double addrspace(1)* %in, i32 %tid
				294	%data = load double addrspace(1)* %src, align 8
				295	%slot0 = getelementptr inbounds double addrspace(3)* %lds, i32 %tid
				296	store double %data, double addrspace(3)* %slot0, align 4 ; align 4 on an 8-byte value
				297	%tid.far = add nsw i32 %tid, 7 ; 7 doubles == dword offsets 0xe and 0xf in the checks
				298	%slot1 = getelementptr inbounds double addrspace(3)* %lds, i32 %tid.far
				299	store double %data, double addrspace(3)* %slot1, align 4 ; align 4 on an 8-byte value
				300	ret void
				301	}
				302
				303	; SI-LABEL: @simple_write2_two_val_f64
				304	; SI-DAG: BUFFER_LOAD_DWORDX2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
				305	; SI-DAG: BUFFER_LOAD_DWORDX2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
				306	; SI-DAG: V_LSHLREV_B32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
				307	; SI: DS_WRITE2_B64 [[VPTR]], [[VAL0]], [[VAL1]], 0x0, 0x8 [M0]
				308	; SI: S_ENDPGM
				309	define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { ; two consecutive doubles are loaded and stored to @lds.f64 elements that are 8 apart; %C is unused
				310	%tid = tail call i32 @llvm.r600.read.tidig.x() #1
				311	%src.0 = getelementptr double addrspace(1)* %in, i32 %tid
				312	%src.1 = getelementptr double addrspace(1)* %src.0, i32 1 ; the double immediately after %src.0
				313	%data0 = load double addrspace(1)* %src.0, align 8
				314	%data1 = load double addrspace(1)* %src.1, align 8
				315	%slot0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %tid
				316	store double %data0, double addrspace(3)* %slot0, align 8
				317	%tid.far = add nsw i32 %tid, 8 ; element index of the second store
				318	%slot1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %tid.far
				319	store double %data1, double addrspace(3)* %slot1, align 8
				320	ret void
				321	}
				322
				323	@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] zeroinitializer, align 4 ; 264-float addrspace(3) array; indexed from %x.i in @write2_sgemm_sequence
				324	@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] zeroinitializer, align 4 ; 776-float addrspace(3) array; indexed from %y.i in @write2_sgemm_sequence
				325
				326	define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
				327	%x.i = tail call i32 @llvm.r600.read.tgid.x() #1
				328	%y.i = tail call i32 @llvm.r600.read.tidig.y() #1
				329	%val = load float addrspace(1)* %in
				330	%arrayidx44 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
				331	store float %val, float addrspace(3)* %arrayidx44, align 4
				332	%add47 = add nsw i32 %x.i, 1
				333	%arrayidx48 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
				334	store float %val, float addrspace(3)* %arrayidx48, align 4
				335	%add51 = add nsw i32 %x.i, 16
				336	%arrayidx52 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
				337	store float %val, float addrspace(3)* %arrayidx52, align 4
				338	%add55 = add nsw i32 %x.i, 17
				339	%arrayidx56 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
				340	store float %val, float addrspace(3)* %arrayidx56, align 4
				341	%arrayidx60 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
				342	store float %val, float addrspace(3)* %arrayidx60, align 4
				343	%add63 = add nsw i32 %y.i, 1
				344	%arrayidx64 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
				345	store float %val, float addrspace(3)* %arrayidx64, align 4
				346	%add67 = add nsw i32 %y.i, 32
				347	%arrayidx68 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
				348	store float %val, float addrspace(3)* %arrayidx68, align 4
				349	%add71 = add nsw i32 %y.i, 33
				350	%arrayidx72 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
				351	store float %val, float addrspace(3)* %arrayidx72, align 4
				352	%add75 = add nsw i32 %y.i, 64
				353	%arrayidx76 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
				354	store float %val, float addrspace(3)* %arrayidx76, align 4
				355	%add79 = add nsw i32 %y.i, 65
				356	%arrayidx80 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
				357	store float %val, float addrspace(3)* %arrayidx80, align 4
				358	ret void
				359	}
				360
				361	; Function Attrs: nounwind readnone
				362	declare i32 @llvm.r600.read.tgid.x() #1 ; called only by @write2_sgemm_sequence in this file
				363
				364	; Function Attrs: nounwind readnone
				365	declare i32 @llvm.r600.read.tgid.y() #1 ; NOTE(review): no call to this intrinsic appears in the visible file
				366
				367	; Function Attrs: nounwind readnone
				368	declare i32 @llvm.r600.read.tidig.x() #1 ; index source for every test except @write2_sgemm_sequence
				369
				370	; Function Attrs: nounwind readnone
				371	declare i32 @llvm.r600.read.tidig.y() #1 ; called only by @write2_sgemm_sequence in this file
				372
				373	; Function Attrs: noduplicate nounwind
				374	declare void @llvm.AMDGPU.barrier.local() #2 ; NOTE(review): no call to this intrinsic appears in the visible file
				375
				376	attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; attached to every function definition in this file
				377	attributes #1 = { nounwind readnone } ; attached to the read.* intrinsic declarations and their call sites
				378	attributes #2 = { noduplicate nounwind } ; attached only to the barrier.local declaration