Blame - llvm/test/CodeGen/Thumb2/mve-float16regloops.ll - toolchain/llvm-project

blob: dae6883127c5828a375ebd39115eb6e503367657 [file] [log] [blame]

David Green	9cf920e	2020-03-20 08:25:19 +0000	[diff] [blame^]	1	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				2	; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
				3
				4	define arm_aapcs_vfpcc void @test_fadd(half* noalias nocapture readonly %A, half %BB, half noalias nocapture %C, i32 %n) {
				5	; CHECK-LABEL: test_fadd:
				6	; CHECK: @ %bb.0: @ %entry
				7	; CHECK-NEXT: cmp r3, #1
				8	; CHECK-NEXT: it lt
				9	; CHECK-NEXT: bxlt lr
				10	; CHECK-NEXT: vldr.16 s0, [r1]
				11	; CHECK-NEXT: vmov r1, s0
				12	; CHECK-NEXT: vdup.16 q0, r1
				13	; CHECK-NEXT: .LBB0_1: @ %vector.body
				14	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				15	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				16	; CHECK-NEXT: subs r3, #8
				17	; CHECK-NEXT: vadd.f16 q1, q1, q0
				18	; CHECK-NEXT: vstrb.8 q1, [r2], #16
				19	; CHECK-NEXT: bne .LBB0_1
				20	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				21	; CHECK-NEXT: bx lr
				22	entry:
				23	%B = load half, half* %BB
				24	%0 = and i32 %n, 7
				25	%cmp = icmp eq i32 %0, 0
				26	tail call void @llvm.assume(i1 %cmp)
				27	%cmp18 = icmp sgt i32 %n, 0
				28	br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
				29
				30	vector.ph: ; preds = %entry
				31	%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
				32	%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
				33	br label %vector.body
				34
				35	vector.body: ; preds = %vector.body, %vector.ph
				36	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				37	%1 = getelementptr inbounds half, half* %A, i32 %index
				38	%2 = bitcast half* %1 to <8 x half>*
				39	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				40	%3 = fadd fast <8 x half> %wide.load, %broadcast.splat11
				41	%4 = getelementptr inbounds half, half* %C, i32 %index
				42	%5 = bitcast half* %4 to <8 x half>*
				43	store <8 x half> %3, <8 x half>* %5, align 4
				44	%index.next = add i32 %index, 8
				45	%6 = icmp eq i32 %index.next, %n
				46	br i1 %6, label %for.cond.cleanup, label %vector.body
				47
				48	for.cond.cleanup: ; preds = %vector.body, %entry
				49	ret void
				50	}
				51
				52	define arm_aapcs_vfpcc void @test_fadd_r(half* noalias nocapture readonly %A, half %BB, half noalias nocapture %C, i32 %n) {
				53	; CHECK-LABEL: test_fadd_r:
				54	; CHECK: @ %bb.0: @ %entry
				55	; CHECK-NEXT: cmp r3, #1
				56	; CHECK-NEXT: it lt
				57	; CHECK-NEXT: bxlt lr
				58	; CHECK-NEXT: vldr.16 s0, [r1]
				59	; CHECK-NEXT: vmov r1, s0
				60	; CHECK-NEXT: vdup.16 q0, r1
				61	; CHECK-NEXT: .LBB1_1: @ %vector.body
				62	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				63	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				64	; CHECK-NEXT: subs r3, #8
				65	; CHECK-NEXT: vadd.f16 q1, q0, q1
				66	; CHECK-NEXT: vstrb.8 q1, [r2], #16
				67	; CHECK-NEXT: bne .LBB1_1
				68	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				69	; CHECK-NEXT: bx lr
				70	entry:
				71	%B = load half, half* %BB
				72	%0 = and i32 %n, 7
				73	%cmp = icmp eq i32 %0, 0
				74	tail call void @llvm.assume(i1 %cmp)
				75	%cmp18 = icmp sgt i32 %n, 0
				76	br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
				77
				78	vector.ph: ; preds = %entry
				79	%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
				80	%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
				81	br label %vector.body
				82
				83	vector.body: ; preds = %vector.body, %vector.ph
				84	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				85	%1 = getelementptr inbounds half, half* %A, i32 %index
				86	%2 = bitcast half* %1 to <8 x half>*
				87	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				88	%3 = fadd fast <8 x half> %broadcast.splat11, %wide.load
				89	%4 = getelementptr inbounds half, half* %C, i32 %index
				90	%5 = bitcast half* %4 to <8 x half>*
				91	store <8 x half> %3, <8 x half>* %5, align 4
				92	%index.next = add i32 %index, 8
				93	%6 = icmp eq i32 %index.next, %n
				94	br i1 %6, label %for.cond.cleanup, label %vector.body
				95
				96	for.cond.cleanup: ; preds = %vector.body, %entry
				97	ret void
				98	}
				99
				100	define arm_aapcs_vfpcc void @test_fmul(half* noalias nocapture readonly %A, half %BB, half noalias nocapture %C, i32 %n) {
				101	; CHECK-LABEL: test_fmul:
				102	; CHECK: @ %bb.0: @ %entry
				103	; CHECK-NEXT: cmp r3, #1
				104	; CHECK-NEXT: it lt
				105	; CHECK-NEXT: bxlt lr
				106	; CHECK-NEXT: vldr.16 s0, [r1]
				107	; CHECK-NEXT: vmov r1, s0
				108	; CHECK-NEXT: vdup.16 q0, r1
				109	; CHECK-NEXT: .LBB2_1: @ %vector.body
				110	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				111	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				112	; CHECK-NEXT: subs r3, #8
				113	; CHECK-NEXT: vmul.f16 q1, q1, q0
				114	; CHECK-NEXT: vstrb.8 q1, [r2], #16
				115	; CHECK-NEXT: bne .LBB2_1
				116	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				117	; CHECK-NEXT: bx lr
				118	entry:
				119	%B = load half, half* %BB
				120	%0 = and i32 %n, 7
				121	%cmp = icmp eq i32 %0, 0
				122	tail call void @llvm.assume(i1 %cmp)
				123	%cmp18 = icmp sgt i32 %n, 0
				124	br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
				125
				126	vector.ph: ; preds = %entry
				127	%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
				128	%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
				129	br label %vector.body
				130
				131	vector.body: ; preds = %vector.body, %vector.ph
				132	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				133	%1 = getelementptr inbounds half, half* %A, i32 %index
				134	%2 = bitcast half* %1 to <8 x half>*
				135	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				136	%3 = fmul fast <8 x half> %wide.load, %broadcast.splat11
				137	%4 = getelementptr inbounds half, half* %C, i32 %index
				138	%5 = bitcast half* %4 to <8 x half>*
				139	store <8 x half> %3, <8 x half>* %5, align 4
				140	%index.next = add i32 %index, 8
				141	%6 = icmp eq i32 %index.next, %n
				142	br i1 %6, label %for.cond.cleanup, label %vector.body
				143
				144	for.cond.cleanup: ; preds = %vector.body, %entry
				145	ret void
				146	}
				147
				148	define arm_aapcs_vfpcc void @test_fmul_r(half* noalias nocapture readonly %A, half %BB, half noalias nocapture %C, i32 %n) {
				149	; CHECK-LABEL: test_fmul_r:
				150	; CHECK: @ %bb.0: @ %entry
				151	; CHECK-NEXT: cmp r3, #1
				152	; CHECK-NEXT: it lt
				153	; CHECK-NEXT: bxlt lr
				154	; CHECK-NEXT: vldr.16 s0, [r1]
				155	; CHECK-NEXT: vmov r1, s0
				156	; CHECK-NEXT: vdup.16 q0, r1
				157	; CHECK-NEXT: .LBB3_1: @ %vector.body
				158	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				159	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				160	; CHECK-NEXT: subs r3, #8
				161	; CHECK-NEXT: vmul.f16 q1, q0, q1
				162	; CHECK-NEXT: vstrb.8 q1, [r2], #16
				163	; CHECK-NEXT: bne .LBB3_1
				164	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				165	; CHECK-NEXT: bx lr
				166	entry:
				167	%B = load half, half* %BB
				168	%0 = and i32 %n, 7
				169	%cmp = icmp eq i32 %0, 0
				170	tail call void @llvm.assume(i1 %cmp)
				171	%cmp18 = icmp sgt i32 %n, 0
				172	br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
				173
				174	vector.ph: ; preds = %entry
				175	%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
				176	%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
				177	br label %vector.body
				178
				179	vector.body: ; preds = %vector.body, %vector.ph
				180	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				181	%1 = getelementptr inbounds half, half* %A, i32 %index
				182	%2 = bitcast half* %1 to <8 x half>*
				183	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				184	%3 = fmul fast <8 x half> %broadcast.splat11, %wide.load
				185	%4 = getelementptr inbounds half, half* %C, i32 %index
				186	%5 = bitcast half* %4 to <8 x half>*
				187	store <8 x half> %3, <8 x half>* %5, align 4
				188	%index.next = add i32 %index, 8
				189	%6 = icmp eq i32 %index.next, %n
				190	br i1 %6, label %for.cond.cleanup, label %vector.body
				191
				192	for.cond.cleanup: ; preds = %vector.body, %entry
				193	ret void
				194	}
				195
				196	define arm_aapcs_vfpcc void @test_fsub(half* noalias nocapture readonly %A, half %BB, half noalias nocapture %C, i32 %n) {
				197	; CHECK-LABEL: test_fsub:
				198	; CHECK: @ %bb.0: @ %entry
				199	; CHECK-NEXT: cmp r3, #1
				200	; CHECK-NEXT: it lt
				201	; CHECK-NEXT: bxlt lr
				202	; CHECK-NEXT: vldr.16 s0, [r1]
				203	; CHECK-NEXT: vmov r1, s0
				204	; CHECK-NEXT: vdup.16 q0, r1
				205	; CHECK-NEXT: .LBB4_1: @ %vector.body
				206	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				207	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				208	; CHECK-NEXT: subs r3, #8
				209	; CHECK-NEXT: vsub.f16 q1, q1, q0
				210	; CHECK-NEXT: vstrb.8 q1, [r2], #16
				211	; CHECK-NEXT: bne .LBB4_1
				212	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				213	; CHECK-NEXT: bx lr
				214	entry:
				215	%B = load half, half* %BB
				216	%0 = and i32 %n, 7
				217	%cmp = icmp eq i32 %0, 0
				218	tail call void @llvm.assume(i1 %cmp)
				219	%cmp18 = icmp sgt i32 %n, 0
				220	br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
				221
				222	vector.ph: ; preds = %entry
				223	%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
				224	%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
				225	br label %vector.body
				226
				227	vector.body: ; preds = %vector.body, %vector.ph
				228	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				229	%1 = getelementptr inbounds half, half* %A, i32 %index
				230	%2 = bitcast half* %1 to <8 x half>*
				231	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				232	%3 = fsub fast <8 x half> %wide.load, %broadcast.splat11
				233	%4 = getelementptr inbounds half, half* %C, i32 %index
				234	%5 = bitcast half* %4 to <8 x half>*
				235	store <8 x half> %3, <8 x half>* %5, align 4
				236	%index.next = add i32 %index, 8
				237	%6 = icmp eq i32 %index.next, %n
				238	br i1 %6, label %for.cond.cleanup, label %vector.body
				239
				240	for.cond.cleanup: ; preds = %vector.body, %entry
				241	ret void
				242	}
				243
				244	define arm_aapcs_vfpcc void @test_fsub_r(half* noalias nocapture readonly %A, half %BB, half noalias nocapture %C, i32 %n) {
				245	; CHECK-LABEL: test_fsub_r:
				246	; CHECK: @ %bb.0: @ %entry
				247	; CHECK-NEXT: cmp r3, #1
				248	; CHECK-NEXT: it lt
				249	; CHECK-NEXT: bxlt lr
				250	; CHECK-NEXT: vldr.16 s0, [r1]
				251	; CHECK-NEXT: vmov r1, s0
				252	; CHECK-NEXT: vdup.16 q0, r1
				253	; CHECK-NEXT: .LBB5_1: @ %vector.body
				254	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				255	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				256	; CHECK-NEXT: subs r3, #8
				257	; CHECK-NEXT: vsub.f16 q1, q0, q1
				258	; CHECK-NEXT: vstrb.8 q1, [r2], #16
				259	; CHECK-NEXT: bne .LBB5_1
				260	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				261	; CHECK-NEXT: bx lr
				262	entry:
				263	%B = load half, half* %BB
				264	%0 = and i32 %n, 7
				265	%cmp = icmp eq i32 %0, 0
				266	tail call void @llvm.assume(i1 %cmp)
				267	%cmp18 = icmp sgt i32 %n, 0
				268	br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
				269
				270	vector.ph: ; preds = %entry
				271	%broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
				272	%broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
				273	br label %vector.body
				274
				275	vector.body: ; preds = %vector.body, %vector.ph
				276	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				277	%1 = getelementptr inbounds half, half* %A, i32 %index
				278	%2 = bitcast half* %1 to <8 x half>*
				279	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				280	%3 = fsub fast <8 x half> %broadcast.splat11, %wide.load
				281	%4 = getelementptr inbounds half, half* %C, i32 %index
				282	%5 = bitcast half* %4 to <8 x half>*
				283	store <8 x half> %3, <8 x half>* %5, align 4
				284	%index.next = add i32 %index, 8
				285	%6 = icmp eq i32 %index.next, %n
				286	br i1 %6, label %for.cond.cleanup, label %vector.body
				287
				288	for.cond.cleanup: ; preds = %vector.body, %entry
				289	ret void
				290	}
				291
				292
				293	define arm_aapcs_vfpcc void @test_fmas(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %CC, half noalias nocapture %D, i32 %n) {
				294	; CHECK-LABEL: test_fmas:
				295	; CHECK: @ %bb.0: @ %entry
				296	; CHECK-NEXT: ldr.w r12, [sp]
				297	; CHECK-NEXT: cmp.w r12, #1
				298	; CHECK-NEXT: it lt
				299	; CHECK-NEXT: bxlt lr
				300	; CHECK-NEXT: vldr.16 s0, [r2]
				301	; CHECK-NEXT: vmov r2, s0
				302	; CHECK-NEXT: vdup.16 q0, r2
				303	; CHECK-NEXT: .LBB6_1: @ %vector.body
				304	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				305	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				306	; CHECK-NEXT: vldrw.u32 q2, [r1], #16
				307	; CHECK-NEXT: vmov q3, q0
				308	; CHECK-NEXT: subs.w r12, r12, #8
				309	; CHECK-NEXT: vfma.f16 q3, q2, q1
				310	; CHECK-NEXT: vstrb.8 q3, [r3], #16
				311	; CHECK-NEXT: bne .LBB6_1
				312	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				313	; CHECK-NEXT: bx lr
				314	entry:
				315	%C = load half, half* %CC
				316	%0 = and i32 %n, 7
				317	%cmp = icmp eq i32 %0, 0
				318	tail call void @llvm.assume(i1 %cmp)
				319	%cmp110 = icmp sgt i32 %n, 0
				320	br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
				321
				322	vector.ph: ; preds = %entry
				323	%broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
				324	%broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
				325	br label %vector.body
				326
				327	vector.body: ; preds = %vector.body, %vector.ph
				328	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				329	%1 = getelementptr inbounds half, half* %A, i32 %index
				330	%2 = bitcast half* %1 to <8 x half>*
				331	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				332	%3 = getelementptr inbounds half, half* %B, i32 %index
				333	%4 = bitcast half* %3 to <8 x half>*
				334	%wide.load12 = load <8 x half>, <8 x half>* %4, align 4
				335	%5 = fmul fast <8 x half> %wide.load12, %wide.load
				336	%6 = fadd fast <8 x half> %5, %broadcast.splat14
				337	%7 = getelementptr inbounds half, half* %D, i32 %index
				338	%8 = bitcast half* %7 to <8 x half>*
				339	store <8 x half> %6, <8 x half>* %8, align 4
				340	%index.next = add i32 %index, 8
				341	%9 = icmp eq i32 %index.next, %n
				342	br i1 %9, label %for.cond.cleanup, label %vector.body
				343
				344	for.cond.cleanup: ; preds = %vector.body, %entry
				345	ret void
				346	}
				347
				348	define arm_aapcs_vfpcc void @test_fmas_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %CC, half noalias nocapture %D, i32 %n) {
				349	; CHECK-LABEL: test_fmas_r:
				350	; CHECK: @ %bb.0: @ %entry
				351	; CHECK-NEXT: ldr.w r12, [sp]
				352	; CHECK-NEXT: cmp.w r12, #1
				353	; CHECK-NEXT: it lt
				354	; CHECK-NEXT: bxlt lr
				355	; CHECK-NEXT: vldr.16 s0, [r2]
				356	; CHECK-NEXT: vmov r2, s0
				357	; CHECK-NEXT: vdup.16 q0, r2
				358	; CHECK-NEXT: .LBB7_1: @ %vector.body
				359	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				360	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				361	; CHECK-NEXT: vldrw.u32 q2, [r1], #16
				362	; CHECK-NEXT: vmov q3, q0
				363	; CHECK-NEXT: subs.w r12, r12, #8
				364	; CHECK-NEXT: vfma.f16 q3, q2, q1
				365	; CHECK-NEXT: vstrb.8 q3, [r3], #16
				366	; CHECK-NEXT: bne .LBB7_1
				367	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				368	; CHECK-NEXT: bx lr
				369	entry:
				370	%C = load half, half* %CC
				371	%0 = and i32 %n, 7
				372	%cmp = icmp eq i32 %0, 0
				373	tail call void @llvm.assume(i1 %cmp)
				374	%cmp110 = icmp sgt i32 %n, 0
				375	br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
				376
				377	vector.ph: ; preds = %entry
				378	%broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
				379	%broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
				380	br label %vector.body
				381
				382	vector.body: ; preds = %vector.body, %vector.ph
				383	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				384	%1 = getelementptr inbounds half, half* %A, i32 %index
				385	%2 = bitcast half* %1 to <8 x half>*
				386	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				387	%3 = getelementptr inbounds half, half* %B, i32 %index
				388	%4 = bitcast half* %3 to <8 x half>*
				389	%wide.load12 = load <8 x half>, <8 x half>* %4, align 4
				390	%5 = fmul fast <8 x half> %wide.load12, %wide.load
				391	%6 = fadd fast <8 x half> %broadcast.splat14, %5
				392	%7 = getelementptr inbounds half, half* %D, i32 %index
				393	%8 = bitcast half* %7 to <8 x half>*
				394	store <8 x half> %6, <8 x half>* %8, align 4
				395	%index.next = add i32 %index, 8
				396	%9 = icmp eq i32 %index.next, %n
				397	br i1 %9, label %for.cond.cleanup, label %vector.body
				398
				399	for.cond.cleanup: ; preds = %vector.body, %entry
				400	ret void
				401	}
				402
				403	define arm_aapcs_vfpcc void @test_fma(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %CC, half noalias nocapture %D, i32 %n) {
				404	; CHECK-LABEL: test_fma:
				405	; CHECK: @ %bb.0: @ %entry
				406	; CHECK-NEXT: ldr.w r12, [sp]
				407	; CHECK-NEXT: cmp.w r12, #1
				408	; CHECK-NEXT: it lt
				409	; CHECK-NEXT: bxlt lr
				410	; CHECK-NEXT: vldr.16 s0, [r2]
				411	; CHECK-NEXT: vmov r2, s0
				412	; CHECK-NEXT: vdup.16 q0, r2
				413	; CHECK-NEXT: .LBB8_1: @ %vector.body
				414	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				415	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				416	; CHECK-NEXT: vldrw.u32 q2, [r1], #16
				417	; CHECK-NEXT: subs.w r12, r12, #8
				418	; CHECK-NEXT: vfma.f16 q2, q1, q0
				419	; CHECK-NEXT: vstrb.8 q2, [r3], #16
				420	; CHECK-NEXT: bne .LBB8_1
				421	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				422	; CHECK-NEXT: bx lr
				423	entry:
				424	%C = load half, half* %CC
				425	%0 = and i32 %n, 7
				426	%cmp = icmp eq i32 %0, 0
				427	tail call void @llvm.assume(i1 %cmp)
				428	%cmp110 = icmp sgt i32 %n, 0
				429	br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
				430
				431	vector.ph: ; preds = %entry
				432	%broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
				433	%broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
				434	br label %vector.body
				435
				436	vector.body: ; preds = %vector.body, %vector.ph
				437	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				438	%1 = getelementptr inbounds half, half* %A, i32 %index
				439	%2 = bitcast half* %1 to <8 x half>*
				440	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				441	%3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
				442	%4 = getelementptr inbounds half, half* %B, i32 %index
				443	%5 = bitcast half* %4 to <8 x half>*
				444	%wide.load14 = load <8 x half>, <8 x half>* %5, align 4
				445	%6 = fadd fast <8 x half> %3, %wide.load14
				446	%7 = getelementptr inbounds half, half* %D, i32 %index
				447	%8 = bitcast half* %7 to <8 x half>*
				448	store <8 x half> %6, <8 x half>* %8, align 4
				449	%index.next = add i32 %index, 8
				450	%9 = icmp eq i32 %index.next, %n
				451	br i1 %9, label %for.cond.cleanup, label %vector.body
				452
				453	for.cond.cleanup: ; preds = %vector.body, %entry
				454	ret void
				455	}
				456
				457	define arm_aapcs_vfpcc void @test_fma_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %CC, half noalias nocapture %D, i32 %n) {
				458	; CHECK-LABEL: test_fma_r:
				459	; CHECK: @ %bb.0: @ %entry
				460	; CHECK-NEXT: ldr.w r12, [sp]
				461	; CHECK-NEXT: cmp.w r12, #1
				462	; CHECK-NEXT: it lt
				463	; CHECK-NEXT: bxlt lr
				464	; CHECK-NEXT: vldr.16 s0, [r2]
				465	; CHECK-NEXT: vmov r2, s0
				466	; CHECK-NEXT: vdup.16 q0, r2
				467	; CHECK-NEXT: .LBB9_1: @ %vector.body
				468	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				469	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				470	; CHECK-NEXT: vldrw.u32 q2, [r1], #16
				471	; CHECK-NEXT: subs.w r12, r12, #8
				472	; CHECK-NEXT: vfma.f16 q2, q0, q1
				473	; CHECK-NEXT: vstrb.8 q2, [r3], #16
				474	; CHECK-NEXT: bne .LBB9_1
				475	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				476	; CHECK-NEXT: bx lr
				477	entry:
				478	%C = load half, half* %CC
				479	%0 = and i32 %n, 7
				480	%cmp = icmp eq i32 %0, 0
				481	tail call void @llvm.assume(i1 %cmp)
				482	%cmp110 = icmp sgt i32 %n, 0
				483	br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
				484
				485	vector.ph: ; preds = %entry
				486	%broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
				487	%broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
				488	br label %vector.body
				489
				490	vector.body: ; preds = %vector.body, %vector.ph
				491	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				492	%1 = getelementptr inbounds half, half* %A, i32 %index
				493	%2 = bitcast half* %1 to <8 x half>*
				494	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				495	%3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
				496	%4 = getelementptr inbounds half, half* %B, i32 %index
				497	%5 = bitcast half* %4 to <8 x half>*
				498	%wide.load14 = load <8 x half>, <8 x half>* %5, align 4
				499	%6 = fadd fast <8 x half> %3, %wide.load14
				500	%7 = getelementptr inbounds half, half* %D, i32 %index
				501	%8 = bitcast half* %7 to <8 x half>*
				502	store <8 x half> %6, <8 x half>* %8, align 4
				503	%index.next = add i32 %index, 8
				504	%9 = icmp eq i32 %index.next, %n
				505	br i1 %9, label %for.cond.cleanup, label %vector.body
				506
				507	for.cond.cleanup: ; preds = %vector.body, %entry
				508	ret void
				509	}
				510
				511
				512	define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %CC, half noalias nocapture %D, i32 %n) {
				513	; CHECK-LABEL: test_fmss:
				514	; CHECK: @ %bb.0: @ %entry
				515	; CHECK-NEXT: ldr.w r12, [sp]
				516	; CHECK-NEXT: cmp.w r12, #1
				517	; CHECK-NEXT: it lt
				518	; CHECK-NEXT: bxlt lr
				519	; CHECK-NEXT: vldr.16 s0, [r2]
				520	; CHECK-NEXT: vmov r2, s0
				521	; CHECK-NEXT: vdup.16 q0, r2
				522	; CHECK-NEXT: vneg.f16 q0, q0
				523	; CHECK-NEXT: .LBB10_1: @ %vector.body
				524	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				525	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				526	; CHECK-NEXT: vldrw.u32 q2, [r1], #16
				527	; CHECK-NEXT: vmov q3, q0
				528	; CHECK-NEXT: subs.w r12, r12, #8
				529	; CHECK-NEXT: vfma.f16 q3, q2, q1
				530	; CHECK-NEXT: vstrb.8 q3, [r3], #16
				531	; CHECK-NEXT: bne .LBB10_1
				532	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				533	; CHECK-NEXT: bx lr
				534	entry:
				535	%C = load half, half* %CC
				536	%0 = and i32 %n, 7
				537	%cmp = icmp eq i32 %0, 0
				538	tail call void @llvm.assume(i1 %cmp)
				539	%cmp110 = icmp sgt i32 %n, 0
				540	br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
				541
				542	vector.ph: ; preds = %entry
				543	%broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
				544	%broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
				545	br label %vector.body
				546
				547	vector.body: ; preds = %vector.body, %vector.ph
				548	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				549	%1 = getelementptr inbounds half, half* %A, i32 %index
				550	%2 = bitcast half* %1 to <8 x half>*
				551	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				552	%3 = getelementptr inbounds half, half* %B, i32 %index
				553	%4 = bitcast half* %3 to <8 x half>*
				554	%wide.load12 = load <8 x half>, <8 x half>* %4, align 4
				555	%5 = fmul fast <8 x half> %wide.load12, %wide.load
				556	%6 = fsub fast <8 x half> %5, %broadcast.splat14
				557	%7 = getelementptr inbounds half, half* %D, i32 %index
				558	%8 = bitcast half* %7 to <8 x half>*
				559	store <8 x half> %6, <8 x half>* %8, align 4
				560	%index.next = add i32 %index, 8
				561	%9 = icmp eq i32 %index.next, %n
				562	br i1 %9, label %for.cond.cleanup, label %vector.body
				563
				564	for.cond.cleanup: ; preds = %vector.body, %entry
				565	ret void
				566	}
				567
				568	define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %CC, half noalias nocapture %D, i32 %n) {
				569	; CHECK-LABEL: test_fmss_r:
				570	; CHECK: @ %bb.0: @ %entry
				571	; CHECK-NEXT: ldr.w r12, [sp]
				572	; CHECK-NEXT: cmp.w r12, #1
				573	; CHECK-NEXT: it lt
				574	; CHECK-NEXT: bxlt lr
				575	; CHECK-NEXT: vldr.16 s0, [r2]
				576	; CHECK-NEXT: vmov r2, s0
				577	; CHECK-NEXT: vdup.16 q0, r2
				578	; CHECK-NEXT: .LBB11_1: @ %vector.body
				579	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				580	; CHECK-NEXT: vldrw.u32 q1, [r0], #16
				581	; CHECK-NEXT: vldrw.u32 q2, [r1], #16
				582	; CHECK-NEXT: vmov q3, q0
				583	; CHECK-NEXT: subs.w r12, r12, #8
				584	; CHECK-NEXT: vfms.f16 q3, q2, q1
				585	; CHECK-NEXT: vstrb.8 q3, [r3], #16
				586	; CHECK-NEXT: bne .LBB11_1
				587	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				588	; CHECK-NEXT: bx lr
				589	entry:
				590	%C = load half, half* %CC
				591	%0 = and i32 %n, 7
				592	%cmp = icmp eq i32 %0, 0
				593	tail call void @llvm.assume(i1 %cmp)
				594	%cmp110 = icmp sgt i32 %n, 0
				595	br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
				596
				597	vector.ph: ; preds = %entry
				598	%broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
				599	%broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
				600	br label %vector.body
				601
				602	vector.body: ; preds = %vector.body, %vector.ph
				603	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				604	%1 = getelementptr inbounds half, half* %A, i32 %index
				605	%2 = bitcast half* %1 to <8 x half>*
				606	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				607	%3 = getelementptr inbounds half, half* %B, i32 %index
				608	%4 = bitcast half* %3 to <8 x half>*
				609	%wide.load12 = load <8 x half>, <8 x half>* %4, align 4
				610	%5 = fmul fast <8 x half> %wide.load12, %wide.load
				611	%6 = fsub fast <8 x half> %broadcast.splat14, %5
				612	%7 = getelementptr inbounds half, half* %D, i32 %index
				613	%8 = bitcast half* %7 to <8 x half>*
				614	store <8 x half> %6, <8 x half>* %8, align 4
				615	%index.next = add i32 %index, 8
				616	%9 = icmp eq i32 %index.next, %n
				617	br i1 %9, label %for.cond.cleanup, label %vector.body
				618
				619	for.cond.cleanup: ; preds = %vector.body, %entry
				620	ret void
				621	}
				622
				623	define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %CC, half noalias nocapture %D, i32 %n) {
				624	; CHECK-LABEL: test_fms:
				625	; CHECK: @ %bb.0: @ %entry
				626	; CHECK-NEXT: ldr.w r12, [sp]
				627	; CHECK-NEXT: cmp.w r12, #1
				628	; CHECK-NEXT: it lt
				629	; CHECK-NEXT: bxlt lr
				630	; CHECK-NEXT: vldr.16 s0, [r2]
				631	; CHECK-NEXT: vmov r2, s0
				632	; CHECK-NEXT: vdup.16 q0, r2
				633	; CHECK-NEXT: .LBB12_1: @ %vector.body
				634	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				635	; CHECK-NEXT: vldrw.u32 q1, [r1], #16
				636	; CHECK-NEXT: vldrw.u32 q2, [r0], #16
				637	; CHECK-NEXT: subs.w r12, r12, #8
				638	; CHECK-NEXT: vneg.f16 q1, q1
				639	; CHECK-NEXT: vfma.f16 q1, q2, q0
				640	; CHECK-NEXT: vstrb.8 q1, [r3], #16
				641	; CHECK-NEXT: bne .LBB12_1
				642	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				643	; CHECK-NEXT: bx lr
				644	entry:
				645	%C = load half, half* %CC
				646	%0 = and i32 %n, 7
				647	%cmp = icmp eq i32 %0, 0
				648	tail call void @llvm.assume(i1 %cmp)
				649	%cmp110 = icmp sgt i32 %n, 0
				650	br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
				651
				652	vector.ph: ; preds = %entry
				653	%broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
				654	%broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
				655	br label %vector.body
				656
				657	vector.body: ; preds = %vector.body, %vector.ph
				658	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				659	%1 = getelementptr inbounds half, half* %A, i32 %index
				660	%2 = bitcast half* %1 to <8 x half>*
				661	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				662	%3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
				663	%4 = getelementptr inbounds half, half* %B, i32 %index
				664	%5 = bitcast half* %4 to <8 x half>*
				665	%wide.load14 = load <8 x half>, <8 x half>* %5, align 4
				666	%6 = fsub fast <8 x half> %3, %wide.load14
				667	%7 = getelementptr inbounds half, half* %D, i32 %index
				668	%8 = bitcast half* %7 to <8 x half>*
				669	store <8 x half> %6, <8 x half>* %8, align 4
				670	%index.next = add i32 %index, 8
				671	%9 = icmp eq i32 %index.next, %n
				672	br i1 %9, label %for.cond.cleanup, label %vector.body
				673
				674	for.cond.cleanup: ; preds = %vector.body, %entry
				675	ret void
				676	}
				677
				678	define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %CC, half noalias nocapture %D, i32 %n) {
				679	; CHECK-LABEL: test_fms_r:
				680	; CHECK: @ %bb.0: @ %entry
				681	; CHECK-NEXT: ldr.w r12, [sp]
				682	; CHECK-NEXT: cmp.w r12, #1
				683	; CHECK-NEXT: it lt
				684	; CHECK-NEXT: bxlt lr
				685	; CHECK-NEXT: vldr.16 s0, [r2]
				686	; CHECK-NEXT: vmov r2, s0
				687	; CHECK-NEXT: vdup.16 q0, r2
				688	; CHECK-NEXT: .LBB13_1: @ %vector.body
				689	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				690	; CHECK-NEXT: vldrw.u32 q1, [r1], #16
				691	; CHECK-NEXT: vldrw.u32 q2, [r0], #16
				692	; CHECK-NEXT: subs.w r12, r12, #8
				693	; CHECK-NEXT: vneg.f16 q1, q1
				694	; CHECK-NEXT: vfma.f16 q1, q0, q2
				695	; CHECK-NEXT: vstrb.8 q1, [r3], #16
				696	; CHECK-NEXT: bne .LBB13_1
				697	; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
				698	; CHECK-NEXT: bx lr
				699	entry:
				700	%C = load half, half* %CC
				701	%0 = and i32 %n, 7
				702	%cmp = icmp eq i32 %0, 0
				703	tail call void @llvm.assume(i1 %cmp)
				704	%cmp110 = icmp sgt i32 %n, 0
				705	br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
				706
				707	vector.ph: ; preds = %entry
				708	%broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
				709	%broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
				710	br label %vector.body
				711
				712	vector.body: ; preds = %vector.body, %vector.ph
				713	%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				714	%1 = getelementptr inbounds half, half* %A, i32 %index
				715	%2 = bitcast half* %1 to <8 x half>*
				716	%wide.load = load <8 x half>, <8 x half>* %2, align 4
				717	%3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
				718	%4 = getelementptr inbounds half, half* %B, i32 %index
				719	%5 = bitcast half* %4 to <8 x half>*
				720	%wide.load14 = load <8 x half>, <8 x half>* %5, align 4
				721	%6 = fsub fast <8 x half> %3, %wide.load14
				722	%7 = getelementptr inbounds half, half* %D, i32 %index
				723	%8 = bitcast half* %7 to <8 x half>*
				724	store <8 x half> %6, <8 x half>* %8, align 4
				725	%index.next = add i32 %index, 8
				726	%9 = icmp eq i32 %index.next, %n
				727	br i1 %9, label %for.cond.cleanup, label %vector.body
				728
				729	for.cond.cleanup: ; preds = %vector.body, %entry
				730	ret void
				731	}
				732
				733
					; test_nested: two-level loop nest over half-precision data.
					;   Outer loop (%for.body.us): runs until %inc13.us == %numRows. Each
					;     iteration loads one scalar through the current %pOutT1 pointer and
					;     splats it to <8 x half>.
					;   Inner loop (%vector.body): for %index = 0, 8, ... until %numCols,
					;     pInT1[index..index+7] -= pPRT_in[index..index+7] * splat.
					;   After each row, the pInT1 and pPRT_in pointers advance by %numCols
					;   elements and the pOutT1 pointer advances by one element.
					; The entry block asserts via llvm.assume: %numRows > 0, %numCols > 0,
					; (%numCols & 7) == 0 and %l < %numCols.
					; %pPRT_pDst is never referenced and the value %in loaded from %ina is
					; never used.
					; In the expected assembly the outer loop is a dls/le low-overhead loop,
					; the inner loop uses subs/bne, and the multiply-subtract is a vfms.f16.
				734	define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, half *%ina) local_unnamed_addr #0 {
				735	; CHECK-LABEL: test_nested:
				736	; CHECK: @ %bb.0: @ %for.body.us.preheader
				737	; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
				738	; CHECK-NEXT: push {r4, r5, r6, r7, lr}
				739	; CHECK-NEXT: ldrd lr, r12, [sp, #20]
				740	; CHECK-NEXT: lsl.w r3, r12, #1
				741	; CHECK-NEXT: dls lr, lr
				742	; CHECK-NEXT: .LBB14_1: @ %for.body.us
				743	; CHECK-NEXT: @ =>This Loop Header: Depth=1
				744	; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
				745	; CHECK-NEXT: vldr.16 s0, [r1]
				746	; CHECK-NEXT: mov r5, r12
				747	; CHECK-NEXT: vmov r4, s0
				748	; CHECK-NEXT: vdup.16 q0, r4
				749	; CHECK-NEXT: movs r4, #0
				750	; CHECK-NEXT: .LBB14_2: @ %vector.body
				751	; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1
				752	; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
				753	; CHECK-NEXT: adds r6, r0, r4
				754	; CHECK-NEXT: adds r7, r2, r4
				755	; CHECK-NEXT: vldrw.u32 q1, [r7]
				756	; CHECK-NEXT: vldrw.u32 q2, [r6]
				757	; CHECK-NEXT: adds r4, #16
				758	; CHECK-NEXT: subs r5, #8
				759	; CHECK-NEXT: vfms.f16 q2, q1, q0
				760	; CHECK-NEXT: vstrw.32 q2, [r6]
				761	; CHECK-NEXT: bne .LBB14_2
				762	; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us
				763	; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1
				764	; CHECK-NEXT: add r0, r3
				765	; CHECK-NEXT: add r2, r3
				766	; CHECK-NEXT: adds r1, #2
				767	; CHECK-NEXT: le lr, .LBB14_1
				768	; CHECK-NEXT: @ %bb.4: @ %for.end14
				769	; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
				770	for.body.us.preheader:
					; %in is loaded but has no users in this function.
				771	%in = load half, half* %ina
					; Preconditions handed to the optimizer: both trip counts are positive,
					; %numCols is a multiple of 8, and %l < %numCols.
				772	%cmp = icmp sgt i32 %numRows, 0
				773	tail call void @llvm.assume(i1 %cmp)
				774	%cmp1 = icmp sgt i32 %numCols, 0
				775	tail call void @llvm.assume(i1 %cmp1)
				776	%rem = and i32 %numCols, 7
				777	%cmp2 = icmp eq i32 %rem, 0
				778	tail call void @llvm.assume(i1 %cmp2)
				779	%cmp3 = icmp slt i32 %l, %numCols
				780	tail call void @llvm.assume(i1 %cmp3)
				781	br label %for.body.us
				782
				783	for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
				784	%pInT1.addr.038.us = phi half* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
				785	%i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
				786	%pOutT1.addr.036.us = phi half* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
				787	%pPRT_in.addr.035.us = phi half* [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
					; Next row's pPRT_in pointer (current pointer + %numCols elements).
				788	%scevgep = getelementptr half, half* %pPRT_in.addr.035.us, i32 %numCols
					; Per-row scalar, broadcast to all eight lanes for the inner loop.
				789	%0 = load half, half* %pOutT1.addr.036.us, align 4
				790	%broadcast.splatinsert47 = insertelement <8 x half> undef, half %0, i32 0
				791	%broadcast.splat48 = shufflevector <8 x half> %broadcast.splatinsert47, <8 x half> undef, <8 x i32> zeroinitializer
				792	br label %vector.body
				793
				794	vector.body: ; preds = %vector.body, %for.body.us
				795	%index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
				796	%next.gep = getelementptr half, half* %pInT1.addr.038.us, i32 %index
				797	%next.gep45 = getelementptr half, half* %pPRT_in.addr.035.us, i32 %index
				798	%1 = bitcast half* %next.gep to <8 x half>*
				799	%wide.load = load <8 x half>, <8 x half>* %1, align 4
				800	%2 = bitcast half* %next.gep45 to <8 x half>*
				801	%wide.load46 = load <8 x half>, <8 x half>* %2, align 4
					; %4 = pInT1 chunk - (pPRT_in chunk * splat); written back in place to
					; the same address the pInT1 chunk was loaded from.
				802	%3 = fmul fast <8 x half> %wide.load46, %broadcast.splat48
				803	%4 = fsub fast <8 x half> %wide.load, %3
				804	store <8 x half> %4, <8 x half>* %1, align 4
				805	%index.next = add i32 %index, 8
				806	%5 = icmp eq i32 %index.next, %numCols
				807	br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body
				808
				809	for.cond6.for.end_crit_edge.us: ; preds = %vector.body
					; Advance the per-row pointers and the row counter.
				810	%incdec.ptr.us = getelementptr inbounds half, half* %pOutT1.addr.036.us, i32 1
				811	%scevgep40 = getelementptr half, half* %pInT1.addr.038.us, i32 %numCols
				812	%inc13.us = add nuw nsw i32 %i.037.us, 1
				813	%exitcond41 = icmp eq i32 %inc13.us, %numRows
				814	br i1 %exitcond41, label %for.end14, label %for.body.us
				815
				816	for.end14: ; preds = %for.cond6.for.end_crit_edge.us
				817	ret void
				818	}
				819
				820	%struct.arm_fir_instance_f32 = type { i16, half, half }
				821	define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* %pDst, i32 %blockSize) {
				822	; CHECK-LABEL: arm_fir_f32_1_4_mve:
				823	; CHECK: @ %bb.0: @ %entry
				824	; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
				825	; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
				826	; CHECK-NEXT: .vsave {d8, d9, d10, d11}
				827	; CHECK-NEXT: vpush {d8, d9, d10, d11}
				828	; CHECK-NEXT: ldrh.w r9, [r0]
				829	; CHECK-NEXT: ldr.w r12, [r0, #4]
				830	; CHECK-NEXT: sub.w r7, r9, #1
				831	; CHECK-NEXT: cmp r7, #3
				832	; CHECK-NEXT: bhi .LBB15_6
				833	; CHECK-NEXT: @ %bb.1: @ %if.then
				834	; CHECK-NEXT: ldr r6, [r0, #8]
				835	; CHECK-NEXT: vldr.16 s0, [r6]
				836	; CHECK-NEXT: vmov lr, s0
				837	; CHECK-NEXT: vldr.16 s0, [r6, #2]
				838	; CHECK-NEXT: vdup.16 q3, lr
				839	; CHECK-NEXT: lsr.w lr, r3, #2
				840	; CHECK-NEXT: vmov r5, s0
				841	; CHECK-NEXT: vldr.16 s0, [r6, #4]
				842	; CHECK-NEXT: vdup.16 q2, r5
				843	; CHECK-NEXT: vmov r4, s0
				844	; CHECK-NEXT: vldr.16 s0, [r6, #6]
				845	; CHECK-NEXT: vdup.16 q1, r4
				846	; CHECK-NEXT: add.w r4, r12, r7, lsl #1
				847	; CHECK-NEXT: vmov r6, s0
				848	; CHECK-NEXT: vdup.16 q0, r6
				849	; CHECK-NEXT: wls lr, lr, .LBB15_5
				850	; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
				851	; CHECK-NEXT: bic r10, r3, #3
				852	; CHECK-NEXT: movs r6, #0
				853	; CHECK-NEXT: add.w r8, r2, r10, lsl #1
				854	; CHECK-NEXT: .LBB15_3: @ %while.body
				855	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				856	; CHECK-NEXT: adds r5, r1, r6
				857	; CHECK-NEXT: vldrw.u32 q4, [r5]
				858	; CHECK-NEXT: adds r5, r4, r6
				859	; CHECK-NEXT: vstrw.32 q4, [r5]
				860	; CHECK-NEXT: add.w r5, r12, r6
				861	; CHECK-NEXT: vldrw.u32 q4, [r5]
				862	; CHECK-NEXT: adds r7, r5, #2
				863	; CHECK-NEXT: vldrw.u32 q5, [r7]
				864	; CHECK-NEXT: vmul.f16 q4, q4, q3
				865	; CHECK-NEXT: vfma.f16 q4, q5, q2
				866	; CHECK-NEXT: vldrw.u32 q5, [r5, #4]
				867	; CHECK-NEXT: adds r5, #6
				868	; CHECK-NEXT: vfma.f16 q4, q5, q1
				869	; CHECK-NEXT: vldrw.u32 q5, [r5]
				870	; CHECK-NEXT: adds r5, r2, r6
				871	; CHECK-NEXT: adds r6, #8
				872	; CHECK-NEXT: vfma.f16 q4, q5, q0
				873	; CHECK-NEXT: vstrw.32 q4, [r5]
				874	; CHECK-NEXT: le lr, .LBB15_3
				875	; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
				876	; CHECK-NEXT: add r4, r6
				877	; CHECK-NEXT: add.w r12, r12, r10, lsl #1
				878	; CHECK-NEXT: add.w r1, r1, r10, lsl #1
				879	; CHECK-NEXT: mov r2, r8
				880	; CHECK-NEXT: .LBB15_5: @ %while.end
				881	; CHECK-NEXT: and r7, r3, #3
				882	; CHECK-NEXT: vldrw.u32 q4, [r1]
				883	; CHECK-NEXT: vctp.16 r7
				884	; CHECK-NEXT: vpst
				885	; CHECK-NEXT: vstrht.16 q4, [r4]
				886	; CHECK-NEXT: vldrw.u32 q4, [r12]
				887	; CHECK-NEXT: add.w r1, r12, #2
				888	; CHECK-NEXT: vmul.f16 q3, q4, q3
				889	; CHECK-NEXT: vldrw.u32 q4, [r1]
				890	; CHECK-NEXT: add.w r1, r12, #6
				891	; CHECK-NEXT: vfma.f16 q3, q4, q2
				892	; CHECK-NEXT: vldrw.u32 q2, [r12, #4]
				893	; CHECK-NEXT: vfma.f16 q3, q2, q1
				894	; CHECK-NEXT: vldrw.u32 q1, [r1]
				895	; CHECK-NEXT: vfma.f16 q3, q1, q0
				896	; CHECK-NEXT: vpst
				897	; CHECK-NEXT: vstrht.16 q3, [r2]
				898	; CHECK-NEXT: ldr.w r12, [r0, #4]
				899	; CHECK-NEXT: .LBB15_6: @ %if.end
				900	; CHECK-NEXT: add.w r0, r12, r3, lsl #1
				901	; CHECK-NEXT: lsr.w lr, r9, #2
				902	; CHECK-NEXT: wls lr, lr, .LBB15_10
				903	; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
				904	; CHECK-NEXT: bic r2, r9, #3
				905	; CHECK-NEXT: adds r1, r2, r3
				906	; CHECK-NEXT: mov r3, r12
				907	; CHECK-NEXT: add.w r1, r12, r1, lsl #1
				908	; CHECK-NEXT: .LBB15_8: @ %while.body51
				909	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
				910	; CHECK-NEXT: vldrw.u32 q0, [r0], #8
				911	; CHECK-NEXT: vstrb.8 q0, [r3], #8
				912	; CHECK-NEXT: le lr, .LBB15_8
				913	; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
				914	; CHECK-NEXT: add.w r12, r12, r2, lsl #1
				915	; CHECK-NEXT: mov r0, r1
				916	; CHECK-NEXT: .LBB15_10: @ %while.end55
				917	; CHECK-NEXT: ands r1, r9, #3
				918	; CHECK-NEXT: beq .LBB15_12
				919	; CHECK-NEXT: @ %bb.11: @ %if.then59
				920	; CHECK-NEXT: vldrw.u32 q0, [r0]
				921	; CHECK-NEXT: vctp.16 r1
				922	; CHECK-NEXT: vpst
				923	; CHECK-NEXT: vstrht.16 q0, [r12]
				924	; CHECK-NEXT: .LBB15_12: @ %if.end61
				925	; CHECK-NEXT: vpop {d8, d9, d10, d11}
				926	; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
				927	entry:
				928	%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
				929	%0 = load half, half* %pState1, align 4
				930	%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
				931	%1 = load half, half* %pCoeffs2, align 4
				932	%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
				933	%2 = load i16, i16* %numTaps3, align 4
				934	%conv = zext i16 %2 to i32
				935	%sub = add nsw i32 %conv, -1
				936	%cmp = icmp ult i32 %sub, 4
				937	br i1 %cmp, label %if.then, label %if.end
				938
				939	if.then: ; preds = %entry
				940	%arrayidx = getelementptr inbounds half, half* %0, i32 %sub
				941	%incdec.ptr = getelementptr inbounds half, half* %1, i32 1
				942	%3 = load half, half* %1, align 4
				943	%incdec.ptr6 = getelementptr inbounds half, half* %1, i32 2
				944	%4 = load half, half* %incdec.ptr, align 4
				945	%incdec.ptr7 = getelementptr inbounds half, half* %1, i32 3
				946	%5 = load half, half* %incdec.ptr6, align 4
				947	%6 = load half, half* %incdec.ptr7, align 4
				948	%shr = lshr i32 %blockSize, 2
				949	%cmp9146 = icmp eq i32 %shr, 0
				950	%.pre161 = insertelement <8 x half> undef, half %3, i32 0
				951	%.pre162 = shufflevector <8 x half> %.pre161, <8 x half> undef, <8 x i32> zeroinitializer
				952	%.pre163 = insertelement <8 x half> undef, half %4, i32 0
				953	%.pre164 = shufflevector <8 x half> %.pre163, <8 x half> undef, <8 x i32> zeroinitializer
				954	%.pre165 = insertelement <8 x half> undef, half %5, i32 0
				955	%.pre166 = shufflevector <8 x half> %.pre165, <8 x half> undef, <8 x i32> zeroinitializer
				956	%.pre167 = insertelement <8 x half> undef, half %6, i32 0
				957	%.pre168 = shufflevector <8 x half> %.pre167, <8 x half> undef, <8 x i32> zeroinitializer
				958	br i1 %cmp9146, label %while.end, label %while.body.lr.ph
				959
				960	while.body.lr.ph: ; preds = %if.then
				961	%7 = and i32 %blockSize, -4
				962	%scevgep158 = getelementptr half, half* %pDst, i32 %7
				963	br label %while.body
				964
				965	while.body: ; preds = %while.body.lr.ph, %while.body
				966	%pStateCur.0151 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
				967	%pSamples.0150 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
				968	%pOutput.0149 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
				969	%pTempSrc.0148 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
				970	%blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
				971	%8 = bitcast half* %pTempSrc.0148 to <8 x half>*
				972	%9 = load <8 x half>, <8 x half>* %8, align 4
				973	%10 = bitcast half* %pStateCur.0151 to <8 x half>*
				974	store <8 x half> %9, <8 x half>* %10, align 4
				975	%add.ptr = getelementptr inbounds half, half* %pStateCur.0151, i32 4
				976	%add.ptr11 = getelementptr inbounds half, half* %pTempSrc.0148, i32 4
				977	%11 = bitcast half* %pSamples.0150 to <8 x half>*
				978	%12 = load <8 x half>, <8 x half>* %11, align 4
				979	%13 = fmul fast <8 x half> %12, %.pre162
				980	%arrayidx12 = getelementptr inbounds half, half* %pSamples.0150, i32 1
				981	%14 = bitcast half* %arrayidx12 to <8 x half>*
				982	%15 = load <8 x half>, <8 x half>* %14, align 4
				983	%mul = fmul fast <8 x half> %15, %.pre164
				984	%add = fadd fast <8 x half> %mul, %13
				985	%arrayidx13 = getelementptr inbounds half, half* %pSamples.0150, i32 2
				986	%16 = bitcast half* %arrayidx13 to <8 x half>*
				987	%17 = load <8 x half>, <8 x half>* %16, align 4
				988	%mul16 = fmul fast <8 x half> %17, %.pre166
				989	%add17 = fadd fast <8 x half> %add, %mul16
				990	%arrayidx18 = getelementptr inbounds half, half* %pSamples.0150, i32 3
				991	%18 = bitcast half* %arrayidx18 to <8 x half>*
				992	%19 = load <8 x half>, <8 x half>* %18, align 4
				993	%mul21 = fmul fast <8 x half> %19, %.pre168
				994	%add22 = fadd fast <8 x half> %add17, %mul21
				995	%20 = bitcast half* %pOutput.0149 to <8 x half>*
				996	store <8 x half> %add22, <8 x half>* %20, align 4
				997	%add.ptr23 = getelementptr inbounds half, half* %pOutput.0149, i32 4
				998	%add.ptr24 = getelementptr inbounds half, half* %pSamples.0150, i32 4
				999	%dec = add nsw i32 %blkCnt.0147, -1
				1000	%cmp9 = icmp eq i32 %dec, 0
				1001	br i1 %cmp9, label %while.end.loopexit, label %while.body
				1002
				1003	while.end.loopexit: ; preds = %while.body
				1004	%scevgep157 = getelementptr half, half* %pSrc, i32 %7
				1005	%scevgep159 = getelementptr half, half* %0, i32 %7
				1006	br label %while.end
				1007
				1008	while.end: ; preds = %if.then, %while.end.loopexit
				1009	%pTempSrc.0.lcssa = phi half* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ]
				1010	%pOutput.0.lcssa = phi half* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ]
				1011	%pSamples.0.lcssa = phi half* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ]
				1012	%pStateCur.0.lcssa = phi half* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ]
				1013	%and = and i32 %blockSize, 3
				1014	%21 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and)
				1015	%22 = bitcast half* %pTempSrc.0.lcssa to <8 x half>*
				1016	%23 = load <8 x half>, <8 x half>* %22, align 4
				1017	%24 = bitcast half* %pStateCur.0.lcssa to <8 x half>*
				1018	tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %23, <8 x half>* %24, i32 4, <8 x i1> %21)
				1019	%25 = bitcast half* %pSamples.0.lcssa to <8 x half>*
				1020	%26 = load <8 x half>, <8 x half>* %25, align 4
				1021	%27 = fmul fast <8 x half> %26, %.pre162
				1022	%arrayidx29 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 1
				1023	%28 = bitcast half* %arrayidx29 to <8 x half>*
				1024	%29 = load <8 x half>, <8 x half>* %28, align 4
				1025	%mul32 = fmul fast <8 x half> %29, %.pre164
				1026	%add33 = fadd fast <8 x half> %mul32, %27
				1027	%arrayidx34 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 2
				1028	%30 = bitcast half* %arrayidx34 to <8 x half>*
				1029	%31 = load <8 x half>, <8 x half>* %30, align 4
				1030	%mul37 = fmul fast <8 x half> %31, %.pre166
				1031	%add38 = fadd fast <8 x half> %add33, %mul37
				1032	%arrayidx39 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 3
				1033	%32 = bitcast half* %arrayidx39 to <8 x half>*
				1034	%33 = load <8 x half>, <8 x half>* %32, align 4
				1035	%mul42 = fmul fast <8 x half> %33, %.pre168
				1036	%add43 = fadd fast <8 x half> %add38, %mul42
				1037	%34 = bitcast half* %pOutput.0.lcssa to <8 x half>*
				1038	tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %add43, <8 x half>* %34, i32 4, <8 x i1> %21)
				1039	%.pre = load half, half* %pState1, align 4
				1040	br label %if.end
				1041
				1042	if.end: ; preds = %while.end, %entry
				1043	%35 = phi half* [ %.pre, %while.end ], [ %0, %entry ]
				1044	%arrayidx45 = getelementptr inbounds half, half* %35, i32 %blockSize
				1045	%shr47 = lshr i32 %conv, 2
				1046	%cmp49141 = icmp eq i32 %shr47, 0
				1047	br i1 %cmp49141, label %while.end55, label %while.body51.preheader
				1048
				1049	while.body51.preheader: ; preds = %if.end
				1050	%36 = and i32 %conv, 65532
				1051	%37 = add i32 %36, %blockSize
				1052	%scevgep = getelementptr half, half* %35, i32 %37
				1053	br label %while.body51
				1054
				1055	while.body51: ; preds = %while.body51.preheader, %while.body51
				1056	%pTempSrc.1144 = phi half* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ]
				1057	%pTempDest.0143 = phi half* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ]
				1058	%blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ]
				1059	%38 = bitcast half* %pTempSrc.1144 to <8 x half>*
				1060	%39 = load <8 x half>, <8 x half>* %38, align 4
				1061	%40 = bitcast half* %pTempDest.0143 to <8 x half>*
				1062	store <8 x half> %39, <8 x half>* %40, align 4
				1063	%add.ptr52 = getelementptr inbounds half, half* %pTempSrc.1144, i32 4
				1064	%add.ptr53 = getelementptr inbounds half, half* %pTempDest.0143, i32 4
				1065	%dec54 = add nsw i32 %blkCnt.1142, -1
				1066	%cmp49 = icmp eq i32 %dec54, 0
				1067	br i1 %cmp49, label %while.end55.loopexit, label %while.body51
				1068
				1069	while.end55.loopexit: ; preds = %while.body51
				1070	%scevgep156 = getelementptr half, half* %35, i32 %36
				1071	br label %while.end55
				1072
				1073	while.end55: ; preds = %while.end55.loopexit, %if.end
				1074	%pTempDest.0.lcssa = phi half* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ]
				1075	%pTempSrc.1.lcssa = phi half* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ]
				1076	%and56 = and i32 %conv, 3
				1077	%cmp57 = icmp eq i32 %and56, 0
				1078	br i1 %cmp57, label %if.end61, label %if.then59
				1079
				1080	if.then59: ; preds = %while.end55
				1081	%41 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and56)
				1082	%42 = bitcast half* %pTempSrc.1.lcssa to <8 x half>*
				1083	%43 = load <8 x half>, <8 x half>* %42, align 4
				1084	%44 = bitcast half* %pTempDest.0.lcssa to <8 x half>*
				1085	tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %43, <8 x half>* %44, i32 4, <8 x i1> %41)
				1086	br label %if.end61
				1087
				1088	if.end61: ; preds = %while.end55, %if.then59
				1089	ret void
				1090	}
				1091
				1092
				1093	define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %blockSize) {
				1094	; CHECK-LABEL: fir:
				1095	; CHECK: @ %bb.0: @ %entry
				1096	; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
				1097	; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
				1098	; CHECK-NEXT: .pad #4
				1099	; CHECK-NEXT: sub sp, #4
				1100	; CHECK-NEXT: .vsave {d8, d9}
				1101	; CHECK-NEXT: vpush {d8, d9}
				1102	; CHECK-NEXT: .pad #16
				1103	; CHECK-NEXT: sub sp, #16
				1104	; CHECK-NEXT: cmp r3, #8
				1105	; CHECK-NEXT: blo.w .LBB16_12
				1106	; CHECK-NEXT: @ %bb.1: @ %if.then
				1107	; CHECK-NEXT: movs r7, #0
				1108	; CHECK-NEXT: cmp.w r7, r3, lsr #2
				1109	; CHECK-NEXT: beq.w .LBB16_12
				1110	; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
				1111	; CHECK-NEXT: ldrh.w r11, [r0]
				1112	; CHECK-NEXT: mov.w r8, #1
				1113	; CHECK-NEXT: ldrd r5, r12, [r0, #4]
				1114	; CHECK-NEXT: lsrs r3, r3, #2
				1115	; CHECK-NEXT: sub.w r0, r11, #8
				1116	; CHECK-NEXT: and r10, r0, #7
				1117	; CHECK-NEXT: add.w r7, r0, r0, lsr #29
				1118	; CHECK-NEXT: add.w r0, r10, #1
				1119	; CHECK-NEXT: asrs r6, r7, #3
				1120	; CHECK-NEXT: cmp r6, #1
				1121	; CHECK-NEXT: it gt
				1122	; CHECK-NEXT: asrgt.w r8, r7, #3
				1123	; CHECK-NEXT: add.w r7, r5, r11, lsl #1
				1124	; CHECK-NEXT: subs r4, r7, #2
				1125	; CHECK-NEXT: rsb.w r7, r11, #0
				1126	; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill
				1127	; CHECK-NEXT: add.w r7, r12, #16
				1128	; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
				1129	; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
				1130	; CHECK-NEXT: b .LBB16_4
				1131	; CHECK-NEXT: .LBB16_3: @ %while.end
				1132	; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
				1133	; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
				1134	; CHECK-NEXT: subs r3, #1
				1135	; CHECK-NEXT: vstrb.8 q0, [r2], #8
				1136	; CHECK-NEXT: add.w r0, r9, r0, lsl #1
				1137	; CHECK-NEXT: add.w r5, r0, #8
				1138	; CHECK-NEXT: beq.w .LBB16_12
				1139	; CHECK-NEXT: .LBB16_4: @ %while.body
				1140	; CHECK-NEXT: @ =>This Loop Header: Depth=1
				1141	; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
				1142	; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
				1143	; CHECK-NEXT: vldrw.u32 q0, [r1], #8
				1144	; CHECK-NEXT: vldr.16 s7, [r12]
				1145	; CHECK-NEXT: vldr.16 s4, [r12, #14]
				1146	; CHECK-NEXT: vldr.16 s6, [r12, #12]
				1147	; CHECK-NEXT: vldr.16 s8, [r12, #10]
				1148	; CHECK-NEXT: vldr.16 s10, [r12, #8]
				1149	; CHECK-NEXT: vldr.16 s12, [r12, #6]
				1150	; CHECK-NEXT: vldr.16 s14, [r12, #4]
				1151	; CHECK-NEXT: vldr.16 s5, [r12, #2]
				1152	; CHECK-NEXT: vstrb.8 q0, [r4], #8
				1153	; CHECK-NEXT: vldrw.u32 q0, [r5]
				1154	; CHECK-NEXT: vmov r0, s7
				1155	; CHECK-NEXT: adds r6, r5, #2
				1156	; CHECK-NEXT: add.w r9, r5, #16
				1157	; CHECK-NEXT: vmul.f16 q0, q0, r0
				1158	; CHECK-NEXT: vldrw.u32 q4, [r6]
				1159	; CHECK-NEXT: vmov r0, s5
				1160	; CHECK-NEXT: adds r6, r5, #6
				1161	; CHECK-NEXT: vfma.f16 q0, q4, r0
				1162	; CHECK-NEXT: vldrw.u32 q4, [r5, #4]
				1163	; CHECK-NEXT: vmov r0, s14
				1164	; CHECK-NEXT: cmp.w r11, #16
				1165	; CHECK-NEXT: vfma.f16 q0, q4, r0
				1166	; CHECK-NEXT: vmov r0, s12
				1167	; CHECK-NEXT: vldrw.u32 q3, [r6]
				1168	; CHECK-NEXT: add.w r6, r5, #10
				1169	; CHECK-NEXT: vfma.f16 q0, q3, r0
				1170	; CHECK-NEXT: vldrw.u32 q3, [r5, #8]
				1171	; CHECK-NEXT: vmov r0, s10
				1172	; CHECK-NEXT: vfma.f16 q0, q3, r0
				1173	; CHECK-NEXT: vmov r0, s8
				1174	; CHECK-NEXT: vldrw.u32 q2, [r6]
				1175	; CHECK-NEXT: add.w r6, r5, #14
				1176	; CHECK-NEXT: vfma.f16 q0, q2, r0
				1177	; CHECK-NEXT: vldrw.u32 q2, [r5, #12]
				1178	; CHECK-NEXT: vmov r0, s6
				1179	; CHECK-NEXT: vfma.f16 q0, q2, r0
				1180	; CHECK-NEXT: vmov r0, s4
				1181	; CHECK-NEXT: vldrw.u32 q1, [r6]
				1182	; CHECK-NEXT: vfma.f16 q0, q1, r0
				1183	; CHECK-NEXT: blo .LBB16_8
				1184	; CHECK-NEXT: @ %bb.5: @ %for.body.preheader
				1185	; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
				1186	; CHECK-NEXT: dls lr, r8
				1187	; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
				1188	; CHECK-NEXT: .LBB16_6: @ %for.body
				1189	; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
				1190	; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
				1191	; CHECK-NEXT: vldr.16 s4, [r6]
				1192	; CHECK-NEXT: add.w r5, r9, #2
				1193	; CHECK-NEXT: vmov r0, s4
				1194	; CHECK-NEXT: vldrw.u32 q1, [r9]
				1195	; CHECK-NEXT: vfma.f16 q0, q1, r0
				1196	; CHECK-NEXT: vldr.16 s4, [r6, #2]
				1197	; CHECK-NEXT: vmov r0, s4
				1198	; CHECK-NEXT: vldrw.u32 q1, [r5]
				1199	; CHECK-NEXT: add.w r5, r9, #6
				1200	; CHECK-NEXT: vfma.f16 q0, q1, r0
				1201	; CHECK-NEXT: vldr.16 s4, [r6, #4]
				1202	; CHECK-NEXT: vmov r0, s4
				1203	; CHECK-NEXT: vldrw.u32 q1, [r9, #4]
				1204	; CHECK-NEXT: vfma.f16 q0, q1, r0
				1205	; CHECK-NEXT: vldr.16 s4, [r6, #6]
				1206	; CHECK-NEXT: vmov r0, s4
				1207	; CHECK-NEXT: vldrw.u32 q1, [r5]
				1208	; CHECK-NEXT: add.w r5, r9, #10
				1209	; CHECK-NEXT: vfma.f16 q0, q1, r0
				1210	; CHECK-NEXT: vldr.16 s4, [r6, #8]
				1211	; CHECK-NEXT: vmov r0, s4
				1212	; CHECK-NEXT: vldrw.u32 q1, [r9, #8]
				1213	; CHECK-NEXT: vfma.f16 q0, q1, r0
				1214	; CHECK-NEXT: vldr.16 s4, [r6, #10]
				1215	; CHECK-NEXT: vmov r0, s4
				1216	; CHECK-NEXT: vldrw.u32 q1, [r5]
				1217	; CHECK-NEXT: add.w r5, r9, #14
				1218	; CHECK-NEXT: vfma.f16 q0, q1, r0
				1219	; CHECK-NEXT: vldr.16 s4, [r6, #12]
				1220	; CHECK-NEXT: vmov r0, s4
				1221	; CHECK-NEXT: vldrw.u32 q1, [r9, #12]
				1222	; CHECK-NEXT: add.w r9, r9, #16
				1223	; CHECK-NEXT: vfma.f16 q0, q1, r0
				1224	; CHECK-NEXT: vldr.16 s4, [r6, #14]
				1225	; CHECK-NEXT: adds r6, #16
				1226	; CHECK-NEXT: vmov r0, s4
				1227	; CHECK-NEXT: vldrw.u32 q1, [r5]
				1228	; CHECK-NEXT: vfma.f16 q0, q1, r0
				1229	; CHECK-NEXT: le lr, .LBB16_6
				1230	; CHECK-NEXT: @ %bb.7: @ %for.end
				1231	; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
				1232	; CHECK-NEXT: cmp.w r10, #0
				1233	; CHECK-NEXT: bne .LBB16_9
				1234	; CHECK-NEXT: b .LBB16_3
				1235	; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_4 Depth=1
				1236	; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
				1237	; CHECK-NEXT: cmp.w r10, #0
				1238	; CHECK-NEXT: beq.w .LBB16_3
				1239	; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
				1240	; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
				1241	; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
				1242	; CHECK-NEXT: mov r5, r9
				1243	; CHECK-NEXT: .LBB16_10: @ %while.body76
				1244	; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
				1245	; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
				1246	; CHECK-NEXT: vldr.16 s4, [r6]
				1247	; CHECK-NEXT: subs r0, #1
				1248	; CHECK-NEXT: adds r6, #2
				1249	; CHECK-NEXT: cmp r0, #1
				1250	; CHECK-NEXT: vmov r7, s4
				1251	; CHECK-NEXT: vldrh.u16 q1, [r5], #2
				1252	; CHECK-NEXT: vfma.f16 q0, q1, r7
				1253	; CHECK-NEXT: bgt .LBB16_10
				1254	; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
				1255	; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
				1256	; CHECK-NEXT: add.w r9, r9, r10, lsl #1
				1257	; CHECK-NEXT: b .LBB16_3
				1258	; CHECK-NEXT: .LBB16_12: @ %if.end
				1259	; CHECK-NEXT: add sp, #16
				1260	; CHECK-NEXT: vpop {d8, d9}
				1261	; CHECK-NEXT: add sp, #4
				1262	; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
				1263	entry:
				1264	%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
				1265	%0 = load half, half* %pState1, align 4
				1266	%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
				1267	%1 = load half, half* %pCoeffs2, align 4
				1268	%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
				1269	%2 = load i16, i16* %numTaps3, align 4
				1270	%conv = zext i16 %2 to i32
				1271	%cmp = icmp ugt i32 %blockSize, 7
				1272	br i1 %cmp, label %if.then, label %if.end
				1273
				1274	if.then: ; preds = %entry
				1275	%shr = lshr i32 %blockSize, 2
				1276	%cmp5217 = icmp eq i32 %shr, 0
				1277	br i1 %cmp5217, label %if.end, label %while.body.lr.ph
				1278
				1279	while.body.lr.ph: ; preds = %if.then
				1280	%sub = add nsw i32 %conv, -1
				1281	%arrayidx = getelementptr inbounds half, half* %0, i32 %sub
				1282	%incdec.ptr = getelementptr inbounds half, half* %1, i32 1
				1283	%incdec.ptr7 = getelementptr inbounds half, half* %1, i32 2
				1284	%incdec.ptr8 = getelementptr inbounds half, half* %1, i32 3
				1285	%incdec.ptr9 = getelementptr inbounds half, half* %1, i32 4
				1286	%incdec.ptr10 = getelementptr inbounds half, half* %1, i32 5
				1287	%incdec.ptr11 = getelementptr inbounds half, half* %1, i32 6
				1288	%incdec.ptr12 = getelementptr inbounds half, half* %1, i32 7
				1289	%sub37 = add nsw i32 %conv, -8
				1290	%div = sdiv i32 %sub37, 8
				1291	%pCoeffsCur.0199 = getelementptr inbounds half, half* %1, i32 8
				1292	%cmp38201 = icmp ugt i16 %2, 15
				1293	%and = and i32 %sub37, 7
				1294	%cmp74210 = icmp eq i32 %and, 0
				1295	%idx.neg = sub nsw i32 0, %conv
				1296	%3 = icmp sgt i32 %div, 1
				1297	%smax = select i1 %3, i32 %div, i32 1
				1298	br label %while.body
				1299
				1300	while.body: ; preds = %while.body.lr.ph, %while.end
				1301	%blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
				1302	%pStateCur.0221 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
				1303	%pSamples.0220 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
				1304	%pTempSrc.0219 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
				1305	%pOutput.0218 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
				1306	%4 = load half, half* %1, align 4
				1307	%5 = load half, half* %incdec.ptr, align 4
				1308	%6 = load half, half* %incdec.ptr7, align 4
				1309	%7 = load half, half* %incdec.ptr8, align 4
				1310	%8 = load half, half* %incdec.ptr9, align 4
				1311	%9 = load half, half* %incdec.ptr10, align 4
				1312	%10 = load half, half* %incdec.ptr11, align 4
				1313	%11 = load half, half* %incdec.ptr12, align 4
				1314	%12 = bitcast half* %pTempSrc.0219 to <8 x half>*
				1315	%13 = load <8 x half>, <8 x half>* %12, align 4
				1316	%14 = bitcast half* %pStateCur.0221 to <8 x half>*
				1317	store <8 x half> %13, <8 x half>* %14, align 4
				1318	%add.ptr = getelementptr inbounds half, half* %pStateCur.0221, i32 4
				1319	%add.ptr14 = getelementptr inbounds half, half* %pTempSrc.0219, i32 4
				1320	%15 = bitcast half* %pSamples.0220 to <8 x half>*
				1321	%16 = load <8 x half>, <8 x half>* %15, align 4
				1322	%.splatinsert = insertelement <8 x half> undef, half %4, i32 0
				1323	%.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
				1324	%17 = fmul fast <8 x half> %16, %.splat
				1325	%arrayidx15 = getelementptr inbounds half, half* %pSamples.0220, i32 1
				1326	%18 = bitcast half* %arrayidx15 to <8 x half>*
				1327	%19 = load <8 x half>, <8 x half>* %18, align 4
				1328	%.splatinsert16 = insertelement <8 x half> undef, half %5, i32 0
				1329	%.splat17 = shufflevector <8 x half> %.splatinsert16, <8 x half> undef, <8 x i32> zeroinitializer
				1330	%20 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %19, <8 x half> %.splat17, <8 x half> %17)
				1331	%arrayidx18 = getelementptr inbounds half, half* %pSamples.0220, i32 2
				1332	%21 = bitcast half* %arrayidx18 to <8 x half>*
				1333	%22 = load <8 x half>, <8 x half>* %21, align 4
				1334	%.splatinsert19 = insertelement <8 x half> undef, half %6, i32 0
				1335	%.splat20 = shufflevector <8 x half> %.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
				1336	%23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %22, <8 x half> %.splat20, <8 x half> %20)
				1337	%arrayidx21 = getelementptr inbounds half, half* %pSamples.0220, i32 3
				1338	%24 = bitcast half* %arrayidx21 to <8 x half>*
				1339	%25 = load <8 x half>, <8 x half>* %24, align 4
				1340	%.splatinsert22 = insertelement <8 x half> undef, half %7, i32 0
				1341	%.splat23 = shufflevector <8 x half> %.splatinsert22, <8 x half> undef, <8 x i32> zeroinitializer
				1342	%26 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %25, <8 x half> %.splat23, <8 x half> %23)
				1343	%arrayidx24 = getelementptr inbounds half, half* %pSamples.0220, i32 4
				1344	%27 = bitcast half* %arrayidx24 to <8 x half>*
				1345	%28 = load <8 x half>, <8 x half>* %27, align 4
				1346	%.splatinsert25 = insertelement <8 x half> undef, half %8, i32 0
				1347	%.splat26 = shufflevector <8 x half> %.splatinsert25, <8 x half> undef, <8 x i32> zeroinitializer
				1348	%29 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %28, <8 x half> %.splat26, <8 x half> %26)
				1349	%arrayidx27 = getelementptr inbounds half, half* %pSamples.0220, i32 5
				1350	%30 = bitcast half* %arrayidx27 to <8 x half>*
				1351	%31 = load <8 x half>, <8 x half>* %30, align 4
				1352	%.splatinsert28 = insertelement <8 x half> undef, half %9, i32 0
				1353	%.splat29 = shufflevector <8 x half> %.splatinsert28, <8 x half> undef, <8 x i32> zeroinitializer
				1354	%32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %31, <8 x half> %.splat29, <8 x half> %29)
				1355	%arrayidx30 = getelementptr inbounds half, half* %pSamples.0220, i32 6
				1356	%33 = bitcast half* %arrayidx30 to <8 x half>*
				1357	%34 = load <8 x half>, <8 x half>* %33, align 4
				1358	%.splatinsert31 = insertelement <8 x half> undef, half %10, i32 0
				1359	%.splat32 = shufflevector <8 x half> %.splatinsert31, <8 x half> undef, <8 x i32> zeroinitializer
				1360	%35 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %34, <8 x half> %.splat32, <8 x half> %32)
				1361	%arrayidx33 = getelementptr inbounds half, half* %pSamples.0220, i32 7
				1362	%36 = bitcast half* %arrayidx33 to <8 x half>*
				1363	%37 = load <8 x half>, <8 x half>* %36, align 4
				1364	%.splatinsert34 = insertelement <8 x half> undef, half %11, i32 0
				1365	%.splat35 = shufflevector <8 x half> %.splatinsert34, <8 x half> undef, <8 x i32> zeroinitializer
				1366	%38 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %37, <8 x half> %.splat35, <8 x half> %35)
				1367	%pSamples.1200 = getelementptr inbounds half, half* %pSamples.0220, i32 8
				1368	br i1 %cmp38201, label %for.body, label %for.end
				1369
				1370	for.body: ; preds = %while.body, %for.body
				1371	%pSamples.1207 = phi half* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
				1372	%pCoeffsCur.0206 = phi half* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
				1373	%.pn205 = phi half* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ]
				1374	%i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
				1375	%vecAcc0.0203 = phi <8 x half> [ %70, %for.body ], [ %38, %while.body ]
				1376	%pSamples.0.pn202 = phi half* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
				1377	%incdec.ptr40 = getelementptr inbounds half, half* %.pn205, i32 9
				1378	%39 = load half, half* %pCoeffsCur.0206, align 4
				1379	%incdec.ptr41 = getelementptr inbounds half, half* %.pn205, i32 10
				1380	%40 = load half, half* %incdec.ptr40, align 4
				1381	%incdec.ptr42 = getelementptr inbounds half, half* %.pn205, i32 11
				1382	%41 = load half, half* %incdec.ptr41, align 4
				1383	%incdec.ptr43 = getelementptr inbounds half, half* %.pn205, i32 12
				1384	%42 = load half, half* %incdec.ptr42, align 4
				1385	%incdec.ptr44 = getelementptr inbounds half, half* %.pn205, i32 13
				1386	%43 = load half, half* %incdec.ptr43, align 4
				1387	%incdec.ptr45 = getelementptr inbounds half, half* %.pn205, i32 14
				1388	%44 = load half, half* %incdec.ptr44, align 4
				1389	%incdec.ptr46 = getelementptr inbounds half, half* %.pn205, i32 15
				1390	%45 = load half, half* %incdec.ptr45, align 4
				1391	%46 = load half, half* %incdec.ptr46, align 4
				1392	%47 = bitcast half* %pSamples.1207 to <8 x half>*
				1393	%48 = load <8 x half>, <8 x half>* %47, align 4
				1394	%.splatinsert48 = insertelement <8 x half> undef, half %39, i32 0
				1395	%.splat49 = shufflevector <8 x half> %.splatinsert48, <8 x half> undef, <8 x i32> zeroinitializer
				1396	%49 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %48, <8 x half> %.splat49, <8 x half> %vecAcc0.0203)
				1397	%arrayidx50 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 9
				1398	%50 = bitcast half* %arrayidx50 to <8 x half>*
				1399	%51 = load <8 x half>, <8 x half>* %50, align 4
				1400	%.splatinsert51 = insertelement <8 x half> undef, half %40, i32 0
				1401	%.splat52 = shufflevector <8 x half> %.splatinsert51, <8 x half> undef, <8 x i32> zeroinitializer
				1402	%52 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %51, <8 x half> %.splat52, <8 x half> %49)
				1403	%arrayidx53 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 10
				1404	%53 = bitcast half* %arrayidx53 to <8 x half>*
				1405	%54 = load <8 x half>, <8 x half>* %53, align 4
				1406	%.splatinsert54 = insertelement <8 x half> undef, half %41, i32 0
				1407	%.splat55 = shufflevector <8 x half> %.splatinsert54, <8 x half> undef, <8 x i32> zeroinitializer
				1408	%55 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %54, <8 x half> %.splat55, <8 x half> %52)
				1409	%arrayidx56 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 11
				1410	%56 = bitcast half* %arrayidx56 to <8 x half>*
				1411	%57 = load <8 x half>, <8 x half>* %56, align 4
				1412	%.splatinsert57 = insertelement <8 x half> undef, half %42, i32 0
				1413	%.splat58 = shufflevector <8 x half> %.splatinsert57, <8 x half> undef, <8 x i32> zeroinitializer
				1414	%58 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %57, <8 x half> %.splat58, <8 x half> %55)
				1415	%arrayidx59 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 12
				1416	%59 = bitcast half* %arrayidx59 to <8 x half>*
				1417	%60 = load <8 x half>, <8 x half>* %59, align 4
				1418	%.splatinsert60 = insertelement <8 x half> undef, half %43, i32 0
				1419	%.splat61 = shufflevector <8 x half> %.splatinsert60, <8 x half> undef, <8 x i32> zeroinitializer
				1420	%61 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %60, <8 x half> %.splat61, <8 x half> %58)
				1421	%arrayidx62 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 13
				1422	%62 = bitcast half* %arrayidx62 to <8 x half>*
				1423	%63 = load <8 x half>, <8 x half>* %62, align 4
				1424	%.splatinsert63 = insertelement <8 x half> undef, half %44, i32 0
				1425	%.splat64 = shufflevector <8 x half> %.splatinsert63, <8 x half> undef, <8 x i32> zeroinitializer
				1426	%64 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %63, <8 x half> %.splat64, <8 x half> %61)
				1427	%arrayidx65 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 14
				1428	%65 = bitcast half* %arrayidx65 to <8 x half>*
				1429	%66 = load <8 x half>, <8 x half>* %65, align 4
				1430	%.splatinsert66 = insertelement <8 x half> undef, half %45, i32 0
				1431	%.splat67 = shufflevector <8 x half> %.splatinsert66, <8 x half> undef, <8 x i32> zeroinitializer
				1432	%67 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %66, <8 x half> %.splat67, <8 x half> %64)
				1433	%arrayidx68 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 15
				1434	%68 = bitcast half* %arrayidx68 to <8 x half>*
				1435	%69 = load <8 x half>, <8 x half>* %68, align 4
				1436	%.splatinsert69 = insertelement <8 x half> undef, half %46, i32 0
				1437	%.splat70 = shufflevector <8 x half> %.splatinsert69, <8 x half> undef, <8 x i32> zeroinitializer
				1438	%70 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %69, <8 x half> %.splat70, <8 x half> %67)
				1439	%inc = add nuw nsw i32 %i.0204, 1
				1440	%pCoeffsCur.0 = getelementptr inbounds half, half* %pCoeffsCur.0206, i32 8
				1441	%pSamples.1 = getelementptr inbounds half, half* %pSamples.1207, i32 8
				1442	%exitcond = icmp eq i32 %inc, %smax
				1443	br i1 %exitcond, label %for.end, label %for.body
				1444
				1445	for.end: ; preds = %for.body, %while.body
				1446	%vecAcc0.0.lcssa = phi <8 x half> [ %38, %while.body ], [ %70, %for.body ]
				1447	%pCoeffsCur.0.lcssa = phi half* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
				1448	%pSamples.1.lcssa = phi half* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
				1449	br i1 %cmp74210, label %while.end, label %while.body76
				1450
				1451	while.body76: ; preds = %for.end, %while.body76
				1452	%pCoeffsCur.1214 = phi half* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
				1453	%vecAcc0.1213 = phi <8 x half> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
				1454	%numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
				1455	%pSamples.2211 = phi half* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ]
				1456	%incdec.ptr77 = getelementptr inbounds half, half* %pCoeffsCur.1214, i32 1
				1457	%71 = load half, half* %pCoeffsCur.1214, align 4
				1458	%72 = bitcast half* %pSamples.2211 to <8 x half>*
				1459	%73 = load <8 x half>, <8 x half>* %72, align 4
				1460	%.splatinsert78 = insertelement <8 x half> undef, half %71, i32 0
				1461	%.splat79 = shufflevector <8 x half> %.splatinsert78, <8 x half> undef, <8 x i32> zeroinitializer
				1462	%74 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %73, <8 x half> %.splat79, <8 x half> %vecAcc0.1213)
				1463	%incdec.ptr80 = getelementptr inbounds half, half* %pSamples.2211, i32 1
				1464	%dec = add nsw i32 %numCnt.0212, -1
				1465	%cmp74 = icmp sgt i32 %numCnt.0212, 1
				1466	br i1 %cmp74, label %while.body76, label %while.end.loopexit
				1467
				1468	while.end.loopexit: ; preds = %while.body76
				1469	%scevgep = getelementptr half, half* %pSamples.1.lcssa, i32 %and
				1470	br label %while.end
				1471
				1472	while.end: ; preds = %while.end.loopexit, %for.end
				1473	%pSamples.2.lcssa = phi half* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ]
				1474	%vecAcc0.1.lcssa = phi <8 x half> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ]
				1475	%75 = bitcast half* %pOutput.0218 to <8 x half>*
				1476	store <8 x half> %vecAcc0.1.lcssa, <8 x half>* %75, align 4
				1477	%add.ptr81 = getelementptr inbounds half, half* %pOutput.0218, i32 4
				1478	%add.ptr82 = getelementptr inbounds half, half* %pSamples.2.lcssa, i32 4
				1479	%add.ptr83 = getelementptr inbounds half, half* %add.ptr82, i32 %idx.neg
				1480	%dec84 = add nsw i32 %blkCnt.0222, -1
				1481	%cmp5 = icmp eq i32 %dec84, 0
				1482	br i1 %cmp5, label %if.end, label %while.body
				1483
				1484	if.end: ; preds = %while.end, %if.then, %entry
				1485	ret void
				1486	}
				1487
				1488	declare void @llvm.assume(i1) ; intrinsic: takes one i1, returns nothing; called on an `icmp` result in the test bodies (e.g. test_fadd)
				1489	declare <8 x i1> @llvm.arm.mve.vctp16(i32) ; MVE intrinsic: i32 in, <8 x i1> out; presumably the tail predicate for 16-bit lanes -- confirm against the MVE intrinsic docs
				1490	declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) ; 8-lane half fma intrinsic; the loops above pass the running accumulator as the third operand
				1491	declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>) ; masked-store intrinsic: vector, vector pointer, immediate i32, <8 x i1>; presumably value/ptr/alignment/mask -- confirm in the LangRef