Blame - llvm/test/CodeGen/AMDGPU/merge-stores.ll - toolchain/llvm-project

blob: dbf9d4481ffb1bb8943d50f6fdf21955d2e8e753 [file] [log] [blame]

Matt Arsenault	65ad160	2015-05-24 00:51:27 +0000	[diff] [blame]	1	; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
				2	; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
				3
				4	; Run with devices with different unaligned load restrictions.
				5
				6	; TODO: Vector element tests
				7	; TODO: Non-zero base offset for load and store combinations
				8	; TODO: Same base addrspacecasted
				9
				10
				11	; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
				12	; GCN: buffer_store_byte
				13	; GCN: buffer_store_byte
				14	; GCN: s_endpgm
				15	define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
				16	%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
				17
				18	store i8 123, i8 addrspace(1)* %out.gep.1
				19	store i8 456, i8 addrspace(1)* %out, align 2
				20	ret void
				21	}
				22
				23	; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
				24	; GCN: buffer_store_byte
				25	; GCN: buffer_store_byte
				26	; GCN: s_endpgm
				27	define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
				28	%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
				29
				30	store i8 123, i8 addrspace(1)* %out.gep.1
				31	store i8 456, i8 addrspace(1)* %out
				32	ret void
				33	}
				34
					; Stores i16 456 to %out (align 4) and i16 123 to the next element; the
					; check expects a single buffer_store_dword.
				35	; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
				36	; GCN: buffer_store_dword v
				37	define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
				38	%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
				39
				40	store i16 123, i16 addrspace(1)* %out.gep.1
				41	store i16 456, i16 addrspace(1)* %out, align 4
				42	ret void
				43	}
				44
					; Two adjacent i16 stores of constant 0, element 0 marked align 4; the
					; check expects a single buffer_store_dword.
				45	; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
				46	; GCN: buffer_store_dword v
				47	define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
				48	%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
				49
				50	store i16 0, i16 addrspace(1)* %out.gep.1
				51	store i16 0, i16 addrspace(1)* %out, align 4
				52	ret void
				53	}
				54
					; Two adjacent i16 constant stores with no explicit alignment; the checks
					; expect two separate buffer_store_short instructions.
				55	; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
				56	; GCN: buffer_store_short
				57	; GCN: buffer_store_short
				58	; GCN: s_endpgm
				59	define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
				60	%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
				61
				62	store i16 123, i16 addrspace(1)* %out.gep.1
				63	store i16 456, i16 addrspace(1)* %out
				64	ret void
				65	}
				66
					; Adjacent i32 constants: 456 (0x1c8) to element 0 and 123 (0x7b) to
					; element 1. The checks expect both values moved into a consecutive VGPR
					; pair [LO:HI] and written with one buffer_store_dwordx2.
				67	; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
				68	; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
				69	; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
				70	; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
				71	; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
				72	; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
				73	define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
				74	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				75
				76	store i32 123, i32 addrspace(1)* %out.gep.1
				77	store i32 456, i32 addrspace(1)* %out
				78	ret void
				79	}
				80
					; Mixed element types: float 1.0 is stored to element 1 through a bitcast
					; pointer and i32 456 to element 0; one buffer_store_dwordx2 is expected.
				81	; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
				82	; GCN: buffer_store_dwordx2
				83	define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
				84	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				85	%out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
				86	store float 1.0, float addrspace(1)* %out.gep.1.bc
				87	store i32 456, i32 addrspace(1)* %out
				88	ret void
				89	}
				90
					; Mixed element types, mirrored: i32 123 is stored to element 1 through a
					; bitcast pointer and float 4.0 to element 0; one buffer_store_dwordx2 is expected.
				91	; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
Reid Kleckner	2691c59	2015-06-11 17:25:24 +0000	[diff] [blame]	92	; GCN: buffer_store_dwordx2
Matt Arsenault	65ad160	2015-05-24 00:51:27 +0000	[diff] [blame]	93	define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
				94	%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
				95	%out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
				96	store i32 123, i32 addrspace(1)* %out.gep.1.bc
				97	store float 4.0, float addrspace(1)* %out
				98	ret void
				99	}
				100
					; Four adjacent i32 constants, with element 0 stored last; one
					; buffer_store_dwordx4 is expected.
				101	; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
Reid Kleckner	2691c59	2015-06-11 17:25:24 +0000	[diff] [blame]	102	; GCN: buffer_store_dwordx4
Matt Arsenault	65ad160	2015-05-24 00:51:27 +0000	[diff] [blame]	103	define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
				104	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				105	%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
				106	%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
				107
				108	store i32 123, i32 addrspace(1)* %out.gep.1
				109	store i32 456, i32 addrspace(1)* %out.gep.2
				110	store i32 333, i32 addrspace(1)* %out.gep.3
				111	store i32 1234, i32 addrspace(1)* %out
				112	ret void
				113	}
				114
					; Four float constants stored in ascending address order. The dwordx4
					; line uses the XGCN prefix, which the RUN lines in this file do not
					; enable; the active checks expect dword, dword, then dwordx2.
				115	; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
				116	; XGCN: buffer_store_dwordx4
				117	; GCN: buffer_store_dword v
				118	; GCN: buffer_store_dword v
				119	; GCN: buffer_store_dwordx2 v
				120	define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
				121	%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
				122	%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
				123	%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
				124
				125	store float 8.0, float addrspace(1)* %out
				126	store float 1.0, float addrspace(1)* %out.gep.1
				127	store float 2.0, float addrspace(1)* %out.gep.2
				128	store float 4.0, float addrspace(1)* %out.gep.3
				129	ret void
				130	}
				131
				132	; First store is out of order. Because of order of combines, the
				133	; consecutive store fails because only some of the stores have been
				134	; replaced with integer constant stores, and then won't merge because
				135	; the types are different.
				136
					; Four float constants where the store to element 0 (%out) comes last.
					; The active checks expect four separate buffer_store_dword instructions;
					; the dwordx4 line uses the XGCN prefix, which the RUN lines do not enable.
				137	; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
				138	; XGCN: buffer_store_dwordx4
				139	; GCN: buffer_store_dword v
				140	; GCN: buffer_store_dword v
				141	; GCN: buffer_store_dword v
				142	; GCN: buffer_store_dword v
				143	define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
				144	%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
				145	%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
				146	%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
				147
				148	store float 1.0, float addrspace(1)* %out.gep.1
				149	store float 2.0, float addrspace(1)* %out.gep.2
				150	store float 4.0, float addrspace(1)* %out.gep.3
				151	store float 8.0, float addrspace(1)* %out
				152	ret void
				153	}
				154
				155	; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
				156	; SI-DAG: buffer_store_dwordx2
				157	; SI-DAG: buffer_store_dword
				158	; SI-NOT: buffer_store_dword
				159	; GCN: s_endpgm
				160	define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
				161	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				162	%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
				163
				164	store i32 123, i32 addrspace(1)* %out.gep.1
				165	store i32 456, i32 addrspace(1)* %out.gep.2
				166	store i32 1234, i32 addrspace(1)* %out
				167	ret void
				168	}
				169
					; Two adjacent i64 constants. The active checks expect two
					; buffer_store_dwordx2 instructions; the dwordx4 line uses the XGCN
					; prefix, which the RUN lines in this file do not enable.
				170	; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
				171	; XGCN: buffer_store_dwordx4
				172	; GCN: buffer_store_dwordx2
				173	; GCN: buffer_store_dwordx2
				174	define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
				175	%out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
				176
				177	store i64 123, i64 addrspace(1)* %out.gep.1
				178	store i64 456, i64 addrspace(1)* %out
				179	ret void
				180	}
				181
					; Four adjacent i64 constants, element 0 stored last. The active checks
					; expect four buffer_store_dwordx2 instructions; the two dwordx4 lines use
					; the XGCN prefix, which the RUN lines in this file do not enable.
				182	; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
				183	; XGCN: buffer_store_dwordx4
				184	; XGCN: buffer_store_dwordx4
				185
				186	; GCN: buffer_store_dwordx2
				187	; GCN: buffer_store_dwordx2
				188	; GCN: buffer_store_dwordx2
				189	; GCN: buffer_store_dwordx2
				190	define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
				191	%out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
				192	%out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
				193	%out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
				194
				195	store i64 123, i64 addrspace(1)* %out.gep.1
				196	store i64 456, i64 addrspace(1)* %out.gep.2
				197	store i64 333, i64 addrspace(1)* %out.gep.3
				198	store i64 1234, i64 addrspace(1)* %out
				199	ret void
				200	}
				201
					; Copies two adjacent i32 values from %in to %out in the same order. The
					; checks expect one dwordx2 load whose register pair feeds one dwordx2 store.
				202	; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
				203	; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
				204	; GCN: buffer_store_dwordx2 [[LOAD]]
				205	define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
				206	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				207	%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
				208
				209	%lo = load i32, i32 addrspace(1)* %in
				210	%hi = load i32, i32 addrspace(1)* %in.gep.1
				211
				212	store i32 %lo, i32 addrspace(1)* %out
				213	store i32 %hi, i32 addrspace(1)* %out.gep.1
				214	ret void
				215	}
				216
					; Same two-element copy, but both the source and destination start at
					; element 2 (byte offset 8); the checks expect offset:8 on the merged
					; load and on the merged store.
				217	; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
				218	; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
				219	; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
				220	define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
				221	%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
				222	%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
				223
				224	%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
				225	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
				226	%lo = load i32, i32 addrspace(1)* %in.gep.0
				227	%hi = load i32, i32 addrspace(1)* %in.gep.1
				228
				229	store i32 %lo, i32 addrspace(1)* %out.gep.0
				230	store i32 %hi, i32 addrspace(1)* %out.gep.1
				231	ret void
				232	}
				233
					; The two loaded values are stored swapped (%hi to element 0, %lo to
					; element 1); the checks expect two dword loads and two dword stores.
				234	; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
				235	; GCN: buffer_load_dword v
				236	; GCN: buffer_load_dword v
				237	; GCN: buffer_store_dword v
				238	; GCN: buffer_store_dword v
				239	define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
				240	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				241	%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
				242
				243	%lo = load i32, i32 addrspace(1)* %in
				244	%hi = load i32, i32 addrspace(1)* %in.gep.1
				245
				246	store i32 %hi, i32 addrspace(1)* %out
				247	store i32 %lo, i32 addrspace(1)* %out.gep.1
				248	ret void
				249	}
				250
					; Copies four adjacent i32 values from %in to %out in the same order. The
					; checks expect one dwordx4 load whose registers feed one dwordx4 store.
				251	; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
				252	; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
				253	; GCN: buffer_store_dwordx4 [[LOAD]]
				254	define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
				255	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				256	%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
				257	%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
				258	%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
				259	%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
				260	%in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
				261
				262	%x = load i32, i32 addrspace(1)* %in
				263	%y = load i32, i32 addrspace(1)* %in.gep.1
				264	%z = load i32, i32 addrspace(1)* %in.gep.2
				265	%w = load i32, i32 addrspace(1)* %in.gep.3
				266
				267	store i32 %x, i32 addrspace(1)* %out
				268	store i32 %y, i32 addrspace(1)* %out.gep.1
				269	store i32 %z, i32 addrspace(1)* %out.gep.2
				270	store i32 %w, i32 addrspace(1)* %out.gep.3
				271	ret void
				272	}
				273
					; Copies three adjacent i32 values in order. On both the load side and the
					; store side the checks expect a dwordx2 plus a dword access, in either
					; order (SI-DAG), with an s_waitcnt between the loads and the stores.
				274	; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
				275	; SI-DAG: buffer_load_dwordx2
				276	; SI-DAG: buffer_load_dword v
				277	; GCN: s_waitcnt
				278	; SI-DAG: buffer_store_dword v
				279	; SI-DAG: buffer_store_dwordx2 v
				280	; GCN: s_endpgm
				281	define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
				282	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				283	%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
				284	%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
				285	%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
				286
				287	%x = load i32, i32 addrspace(1)* %in
				288	%y = load i32, i32 addrspace(1)* %in.gep.1
				289	%z = load i32, i32 addrspace(1)* %in.gep.2
				290
				291	store i32 %x, i32 addrspace(1)* %out
				292	store i32 %y, i32 addrspace(1)* %out.gep.1
				293	store i32 %z, i32 addrspace(1)* %out.gep.2
				294	ret void
				295	}
				296
					; Float version of the four-element in-order copy; the checks expect one
					; dwordx4 load whose registers feed one dwordx4 store.
				297	; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
				298	; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
				299	; GCN: buffer_store_dwordx4 [[LOAD]]
				300	define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
				301	%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
				302	%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
				303	%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
				304	%in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
				305	%in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
				306	%in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
				307
				308	%x = load float, float addrspace(1)* %in
				309	%y = load float, float addrspace(1)* %in.gep.1
				310	%z = load float, float addrspace(1)* %in.gep.2
				311	%w = load float, float addrspace(1)* %in.gep.3
				312
				313	store float %x, float addrspace(1)* %out
				314	store float %y, float addrspace(1)* %out.gep.1
				315	store float %z, float addrspace(1)* %out.gep.2
				316	store float %w, float addrspace(1)* %out.gep.3
				317	ret void
				318	}
				319
					; Four-element in-order copy with different non-zero bases: the source
					; starts at element 11 (offset:44 in the load check) and the destination
					; at element 7 (offset:28 in the store check).
				320	; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
				321	; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
				322	; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
				323	define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
				324	%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
				325	%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
				326	%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
				327	%in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
				328	%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
				329	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
				330	%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
				331	%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
				332
				333	%x = load i32, i32 addrspace(1)* %in.gep.0
				334	%y = load i32, i32 addrspace(1)* %in.gep.1
				335	%z = load i32, i32 addrspace(1)* %in.gep.2
				336	%w = load i32, i32 addrspace(1)* %in.gep.3
				337
				338	store i32 %x, i32 addrspace(1)* %out.gep.0
				339	store i32 %y, i32 addrspace(1)* %out.gep.1
				340	store i32 %z, i32 addrspace(1)* %out.gep.2
				341	store i32 %w, i32 addrspace(1)* %out.gep.3
				342	ret void
				343	}
				344
					; Loads four adjacent i32 values, calls the local barrier, then stores each
					; value to its matching element in descending address order. The checks
					; expect a dwordx4 load, s_barrier, and a dwordx4 store of the same registers.
				345	; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
				346	; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
				347	; GCN: s_barrier
				348	; GCN: buffer_store_dwordx4 [[LOAD]]
				349	define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
				350	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				351	%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
				352	%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
				353	%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
				354	%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
				355	%in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
				356
				357	%x = load i32, i32 addrspace(1)* %in
				358	%y = load i32, i32 addrspace(1)* %in.gep.1
				359	%z = load i32, i32 addrspace(1)* %in.gep.2
				360	%w = load i32, i32 addrspace(1)* %in.gep.3
				361
				362	; Make sure the barrier doesn't stop this
				363	tail call void @llvm.AMDGPU.barrier.local() #1
				364
				365	store i32 %w, i32 addrspace(1)* %out.gep.3
				366	store i32 %z, i32 addrspace(1)* %out.gep.2
				367	store i32 %y, i32 addrspace(1)* %out.gep.1
				368	store i32 %x, i32 addrspace(1)* %out
				369
				370	ret void
				371	}
				372
				373	; TODO: Re-packing of loaded register required. Maybe an IR pass
				374	; should catch this?
				375
					; Loads four adjacent i32 values and, after a barrier, stores them in
					; reversed element order (%w to element 0 through %x to element 3). The
					; checks expect four dword loads, s_barrier, and four dword stores.
				376	; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
				377	; GCN: buffer_load_dword v
				378	; GCN: buffer_load_dword v
				379	; GCN: buffer_load_dword v
				380	; GCN: buffer_load_dword v
				381	; GCN: s_barrier
				382	; GCN: buffer_store_dword v
				383	; GCN: buffer_store_dword v
				384	; GCN: buffer_store_dword v
				385	; GCN: buffer_store_dword v
				386	define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
				387	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				388	%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
				389	%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
				390	%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
				391	%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
				392	%in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
				393
				394	%x = load i32, i32 addrspace(1)* %in
				395	%y = load i32, i32 addrspace(1)* %in.gep.1
				396	%z = load i32, i32 addrspace(1)* %in.gep.2
				397	%w = load i32, i32 addrspace(1)* %in.gep.3
				398
				399	; Make sure the barrier doesn't stop this
				400	tail call void @llvm.AMDGPU.barrier.local() #1
				401
				402	store i32 %w, i32 addrspace(1)* %out
				403	store i32 %z, i32 addrspace(1)* %out.gep.1
				404	store i32 %y, i32 addrspace(1)* %out.gep.2
				405	store i32 %x, i32 addrspace(1)* %out.gep.3
				406
				407	ret void
				408	}
				409
					; Copies four adjacent i8 values in order, with align 4 on the element-0
					; load and store. The checks expect a single dword load whose register
					; feeds a single dword store.
				410	; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
				411	; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
				412	; GCN: buffer_store_dword [[LOAD]]
				413	; GCN: s_endpgm
				414	define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
				415	%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
				416	%out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
				417	%out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
				418	%in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
				419	%in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
				420	%in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
				421
				422	%x = load i8, i8 addrspace(1)* %in, align 4
				423	%y = load i8, i8 addrspace(1)* %in.gep.1
				424	%z = load i8, i8 addrspace(1)* %in.gep.2
				425	%w = load i8, i8 addrspace(1)* %in.gep.3
				426
				427	store i8 %x, i8 addrspace(1)* %out, align 4
				428	store i8 %y, i8 addrspace(1)* %out.gep.1
				429	store i8 %z, i8 addrspace(1)* %out.gep.2
				430	store i8 %w, i8 addrspace(1)* %out.gep.3
				431	ret void
				432	}
				433
					; Same four-byte copy with no explicit alignment on any access; the checks
					; expect four ubyte loads followed by four byte stores.
				434	; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
				435	; GCN: buffer_load_ubyte
				436	; GCN: buffer_load_ubyte
				437	; GCN: buffer_load_ubyte
				438	; GCN: buffer_load_ubyte
				439	; GCN: buffer_store_byte
				440	; GCN: buffer_store_byte
				441	; GCN: buffer_store_byte
				442	; GCN: buffer_store_byte
				443	; GCN: s_endpgm
				444	define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
				445	%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
				446	%out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
				447	%out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
				448	%in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
				449	%in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
				450	%in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
				451
				452	%x = load i8, i8 addrspace(1)* %in
				453	%y = load i8, i8 addrspace(1)* %in.gep.1
				454	%z = load i8, i8 addrspace(1)* %in.gep.2
				455	%w = load i8, i8 addrspace(1)* %in.gep.3
				456
				457	store i8 %x, i8 addrspace(1)* %out
				458	store i8 %y, i8 addrspace(1)* %out.gep.1
				459	store i8 %z, i8 addrspace(1)* %out.gep.2
				460	store i8 %w, i8 addrspace(1)* %out.gep.3
				461	ret void
				462	}
				463
				464	; This works once AA is enabled on the subtarget
					; Loads one <4 x i32>, extracts its four elements and stores them to four
					; adjacent i32 slots in order. The active checks expect a dwordx4 load and
					; four dword stores; the merged-store line uses the XGCN prefix, which the
					; RUN lines in this file do not enable.
				465	; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
				466	; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
				467	; XGCN: buffer_store_dwordx4 [[LOAD]]
				468	; GCN: buffer_store_dword v
				469	; GCN: buffer_store_dword v
				470	; GCN: buffer_store_dword v
				471	; GCN: buffer_store_dword v
				472	define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
				473	%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
				474	%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
				475	%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
				476	%vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
				477
				478	%x = extractelement <4 x i32> %vec, i32 0
				479	%y = extractelement <4 x i32> %vec, i32 1
				480	%z = extractelement <4 x i32> %vec, i32 2
				481	%w = extractelement <4 x i32> %vec, i32 3
				482
				483	store i32 %x, i32 addrspace(1)* %out
				484	store i32 %y, i32 addrspace(1)* %out.gep.1
				485	store i32 %z, i32 addrspace(1)* %out.gep.2
				486	store i32 %w, i32 addrspace(1)* %out.gep.3
				487	ret void
				488	}
				489
				490	; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
				491	; GCN: ds_write_b8
				492	; GCN: ds_write_b8
				493	; GCN: s_endpgm
				494	define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
				495	%out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
				496
				497	store i8 123, i8 addrspace(3)* %out.gep.1
				498	store i8 456, i8 addrspace(3)* %out, align 2
				499	ret void
				500	}
				501
					; addrspace(3) version of the two-i32 constant store: 456 (0x1c8) to
					; element 0 and 123 (0x7b) to element 1. The checks expect one
					; ds_write2_b32 taking both values, with offset1:1.
				502	; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
				503	; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
				504	; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
				505	; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
				506	; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
				507	; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
				508	define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
				509	%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
				510
				511	store i32 123, i32 addrspace(3)* %out.gep.1
				512	store i32 456, i32 addrspace(3)* %out
				513	ret void
				514	}
				515
					; Four adjacent i32 constants in addrspace(3), element 0 stored last; the
					; checks expect four ds_write_b32 instructions.
				516	; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
				517	; GCN: ds_write_b32
				518	; GCN: ds_write_b32
				519	; GCN: ds_write_b32
				520	; GCN: ds_write_b32
				521	define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
				522	%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
				523	%out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
				524	%out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
				525
				526	store i32 123, i32 addrspace(3)* %out.gep.1
				527	store i32 456, i32 addrspace(3)* %out.gep.2
				528	store i32 333, i32 addrspace(3)* %out.gep.3
				529	store i32 1234, i32 addrspace(3)* %out
				530	ret void
				531	}
				532
					; Barrier intrinsic called by the four-load inverse_i32 and shuffle_i32 tests.
				533	declare void @llvm.AMDGPU.barrier.local() #1
				534
					; #0 is attached to every test function in this file; #1 to the barrier
					; declaration and its call sites.
				535	attributes #0 = { nounwind }
				536	attributes #1 = { noduplicate nounwind }