[ARM] Fold VMOVrh VLDR to LDRH
This adds a simple fold to combine a VMOVrh of a load into an integer load.
Similar to what is already performed for BITCAST, but needs to account
for the types being of different sizes, creating a zero-extending load.
Differential Revision: https://reviews.llvm.org/D76485
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 02259fe..8161b13 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -742,9 +742,8 @@
; CHECK-NEXT: .LBB14_1: @ %for.body.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
-; CHECK-NEXT: vldr.16 s0, [r1]
+; CHECK-NEXT: ldrh r4, [r1]
; CHECK-NEXT: mov r5, r12
-; CHECK-NEXT: vmov.f16 r4, s0
; CHECK-NEXT: vdup.16 q0, r4
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: .LBB14_2: @ %vector.body
@@ -825,46 +824,42 @@
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: ldrh.w r9, [r0]
+; CHECK-NEXT: ldrh.w r10, [r0]
; CHECK-NEXT: ldr.w r12, [r0, #4]
-; CHECK-NEXT: sub.w r7, r9, #1
+; CHECK-NEXT: sub.w r7, r10, #1
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhi .LBB15_6
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: ldr r6, [r0, #8]
-; CHECK-NEXT: vldr.16 s0, [r6]
-; CHECK-NEXT: vmov.f16 lr, s0
-; CHECK-NEXT: vldr.16 s0, [r6, #2]
-; CHECK-NEXT: vdup.16 q3, lr
; CHECK-NEXT: lsr.w lr, r3, #2
-; CHECK-NEXT: vmov.f16 r5, s0
-; CHECK-NEXT: vldr.16 s0, [r6, #4]
-; CHECK-NEXT: vdup.16 q2, r5
-; CHECK-NEXT: vmov.f16 r4, s0
-; CHECK-NEXT: vldr.16 s0, [r6, #6]
+; CHECK-NEXT: ldrh r4, [r6, #6]
+; CHECK-NEXT: vdup.16 q0, r4
+; CHECK-NEXT: ldrh r4, [r6, #4]
; CHECK-NEXT: vdup.16 q1, r4
+; CHECK-NEXT: ldrh r4, [r6, #2]
+; CHECK-NEXT: ldrh r6, [r6]
+; CHECK-NEXT: vdup.16 q2, r4
; CHECK-NEXT: add.w r4, r12, r7, lsl #1
-; CHECK-NEXT: vmov.f16 r6, s0
-; CHECK-NEXT: vdup.16 q0, r6
+; CHECK-NEXT: vdup.16 q3, r6
; CHECK-NEXT: wls lr, lr, .LBB15_5
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
-; CHECK-NEXT: bic r10, r3, #3
+; CHECK-NEXT: bic r9, r3, #3
; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: add.w r8, r2, r10, lsl #1
+; CHECK-NEXT: add.w r8, r2, r9, lsl #1
; CHECK-NEXT: .LBB15_3: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: adds r5, r1, r6
-; CHECK-NEXT: vldrw.u32 q4, [r5]
-; CHECK-NEXT: adds r5, r4, r6
-; CHECK-NEXT: vstrw.32 q4, [r5]
-; CHECK-NEXT: add.w r5, r12, r6
-; CHECK-NEXT: vldrw.u32 q4, [r5]
-; CHECK-NEXT: adds r7, r5, #2
-; CHECK-NEXT: vldrw.u32 q5, [r7]
+; CHECK-NEXT: adds r7, r1, r6
+; CHECK-NEXT: vldrw.u32 q4, [r7]
+; CHECK-NEXT: adds r7, r4, r6
+; CHECK-NEXT: vstrw.32 q4, [r7]
+; CHECK-NEXT: add.w r7, r12, r6
+; CHECK-NEXT: vldrw.u32 q4, [r7]
+; CHECK-NEXT: adds r5, r7, #2
+; CHECK-NEXT: vldrw.u32 q5, [r5]
+; CHECK-NEXT: adds r5, r7, #6
; CHECK-NEXT: vmul.f16 q4, q4, q3
; CHECK-NEXT: vfma.f16 q4, q5, q2
-; CHECK-NEXT: vldrw.u32 q5, [r5, #4]
-; CHECK-NEXT: adds r5, #6
+; CHECK-NEXT: vldrw.u32 q5, [r7, #4]
; CHECK-NEXT: vfma.f16 q4, q5, q1
; CHECK-NEXT: vldrw.u32 q5, [r5]
; CHECK-NEXT: adds r5, r2, r6
@@ -874,8 +869,8 @@
; CHECK-NEXT: le lr, .LBB15_3
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
; CHECK-NEXT: add r4, r6
-; CHECK-NEXT: add.w r12, r12, r10, lsl #1
-; CHECK-NEXT: add.w r1, r1, r10, lsl #1
+; CHECK-NEXT: add.w r12, r12, r9, lsl #1
+; CHECK-NEXT: add.w r1, r1, r9, lsl #1
; CHECK-NEXT: mov r2, r8
; CHECK-NEXT: .LBB15_5: @ %while.end
; CHECK-NEXT: and r7, r3, #3
@@ -898,10 +893,10 @@
; CHECK-NEXT: ldr.w r12, [r0, #4]
; CHECK-NEXT: .LBB15_6: @ %if.end
; CHECK-NEXT: add.w r0, r12, r3, lsl #1
-; CHECK-NEXT: lsr.w lr, r9, #2
+; CHECK-NEXT: lsr.w lr, r10, #2
; CHECK-NEXT: wls lr, lr, .LBB15_10
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
-; CHECK-NEXT: bic r2, r9, #3
+; CHECK-NEXT: bic r2, r10, #3
; CHECK-NEXT: adds r1, r2, r3
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: add.w r1, r12, r1, lsl #1
@@ -914,7 +909,7 @@
; CHECK-NEXT: add.w r12, r12, r2, lsl #1
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: .LBB15_10: @ %while.end55
-; CHECK-NEXT: ands r1, r9, #3
+; CHECK-NEXT: ands r1, r10, #3
; CHECK-NEXT: beq .LBB15_12
; CHECK-NEXT: @ %bb.11: @ %if.then59
; CHECK-NEXT: vldrw.u32 q0, [r0]
@@ -1095,170 +1090,154 @@
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: .pad #28
+; CHECK-NEXT: sub sp, #28
; CHECK-NEXT: cmp r3, #8
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: blo.w .LBB16_12
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: cmp.w r7, r3, lsr #2
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
-; CHECK-NEXT: ldrh.w r11, [r0]
-; CHECK-NEXT: mov.w r8, #1
+; CHECK-NEXT: ldrh r4, [r0]
+; CHECK-NEXT: movs r1, #1
; CHECK-NEXT: ldrd r5, r12, [r0, #4]
-; CHECK-NEXT: lsrs r3, r3, #2
-; CHECK-NEXT: sub.w r0, r11, #8
-; CHECK-NEXT: and r10, r0, #7
+; CHECK-NEXT: lsr.w r11, r3, #2
+; CHECK-NEXT: sub.w r0, r4, #8
+; CHECK-NEXT: rsbs r3, r4, #0
; CHECK-NEXT: add.w r7, r0, r0, lsr #29
-; CHECK-NEXT: add.w r0, r10, #1
+; CHECK-NEXT: and r0, r0, #7
; CHECK-NEXT: asrs r6, r7, #3
; CHECK-NEXT: cmp r6, #1
; CHECK-NEXT: it gt
-; CHECK-NEXT: asrgt.w r8, r7, #3
-; CHECK-NEXT: add.w r7, r5, r11, lsl #1
-; CHECK-NEXT: subs r4, r7, #2
-; CHECK-NEXT: rsb.w r7, r11, #0
-; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: add.w r7, r12, #16
-; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: asrgt r1, r7, #3
+; CHECK-NEXT: add.w r7, r5, r4, lsl #1
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT: subs r1, r7, #2
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r3, r12, #16
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: adds r0, #1
+; CHECK-NEXT: str r4, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_3: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: subs r3, #1
+; CHECK-NEXT: subs.w r11, r11, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #8
-; CHECK-NEXT: add.w r0, r9, r0, lsl #1
+; CHECK-NEXT: add.w r0, r7, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: .LBB16_4: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
-; CHECK-NEXT: vldrw.u32 q0, [r1], #8
-; CHECK-NEXT: vldr.16 s7, [r12]
-; CHECK-NEXT: vldr.16 s4, [r12, #14]
-; CHECK-NEXT: vldr.16 s6, [r12, #12]
-; CHECK-NEXT: vldr.16 s8, [r12, #10]
-; CHECK-NEXT: vldr.16 s10, [r12, #8]
-; CHECK-NEXT: vldr.16 s12, [r12, #6]
-; CHECK-NEXT: vldr.16 s14, [r12, #4]
-; CHECK-NEXT: vldr.16 s5, [r12, #2]
-; CHECK-NEXT: vstrb.8 q0, [r4], #8
-; CHECK-NEXT: adds r6, r5, #2
+; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldrh.w lr, [r12, #14]
+; CHECK-NEXT: vldrw.u32 q0, [r0], #8
+; CHECK-NEXT: ldrh.w r10, [r12, #12]
+; CHECK-NEXT: ldrh.w r7, [r12, #10]
+; CHECK-NEXT: ldrh.w r4, [r12, #8]
+; CHECK-NEXT: ldrh.w r3, [r12, #6]
+; CHECK-NEXT: ldrh.w r6, [r12, #4]
+; CHECK-NEXT: ldrh.w r8, [r12, #2]
+; CHECK-NEXT: ldrh.w r9, [r12]
+; CHECK-NEXT: vstrb.8 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q0, [r5]
-; CHECK-NEXT: vmov.f16 r0, s7
-; CHECK-NEXT: vldrw.u32 q4, [r6]
-; CHECK-NEXT: vmul.f16 q0, q0, r0
-; CHECK-NEXT: vmov.f16 r0, s5
-; CHECK-NEXT: vfma.f16 q0, q4, r0
-; CHECK-NEXT: vldrw.u32 q4, [r5, #4]
-; CHECK-NEXT: vmov.f16 r0, s14
-; CHECK-NEXT: adds r6, r5, #6
-; CHECK-NEXT: vfma.f16 q0, q4, r0
-; CHECK-NEXT: vmov.f16 r0, s12
-; CHECK-NEXT: vldrw.u32 q3, [r6]
-; CHECK-NEXT: add.w r6, r5, #10
-; CHECK-NEXT: add.w r9, r5, #16
-; CHECK-NEXT: cmp.w r11, #16
-; CHECK-NEXT: vfma.f16 q0, q3, r0
-; CHECK-NEXT: vldrw.u32 q3, [r5, #8]
-; CHECK-NEXT: vmov.f16 r0, s10
-; CHECK-NEXT: vfma.f16 q0, q3, r0
-; CHECK-NEXT: vmov.f16 r0, s8
-; CHECK-NEXT: vldrw.u32 q2, [r6]
-; CHECK-NEXT: add.w r6, r5, #14
-; CHECK-NEXT: vfma.f16 q0, q2, r0
-; CHECK-NEXT: vldrw.u32 q2, [r5, #12]
-; CHECK-NEXT: vmov.f16 r0, s6
-; CHECK-NEXT: vfma.f16 q0, q2, r0
-; CHECK-NEXT: vmov.f16 r0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r6]
-; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: blo .LBB16_8
+; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: adds r0, r5, #2
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmul.f16 q0, q0, r9
+; CHECK-NEXT: adds r0, r5, #6
+; CHECK-NEXT: vfma.f16 q0, q1, r8
+; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
+; CHECK-NEXT: vfma.f16 q0, q1, r6
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: add.w r0, r5, #10
+; CHECK-NEXT: vfma.f16 q0, q1, r3
+; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT: vfma.f16 q0, q1, r4
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: add.w r0, r5, #14
+; CHECK-NEXT: vfma.f16 q0, q1, r7
+; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
+; CHECK-NEXT: add.w r7, r5, #16
+; CHECK-NEXT: vfma.f16 q0, q1, r10
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: vfma.f16 q0, q1, lr
+; CHECK-NEXT: cmp r0, #16
+; CHECK-NEXT: blo .LBB16_7
; CHECK-NEXT: @ %bb.5: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT: dls lr, r8
+; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload
+; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: .LBB16_6: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vldr.16 s4, [r6]
-; CHECK-NEXT: add.w r5, r9, #2
-; CHECK-NEXT: vmov.f16 r0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r9]
+; CHECK-NEXT: ldrh r0, [r6]
+; CHECK-NEXT: vldrw.u32 q1, [r7]
+; CHECK-NEXT: adds r3, r7, #2
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldr.16 s4, [r6, #2]
-; CHECK-NEXT: vmov.f16 r0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: add.w r5, r9, #6
+; CHECK-NEXT: vldrw.u32 q1, [r3]
+; CHECK-NEXT: ldrh r0, [r6, #2]
+; CHECK-NEXT: adds r3, r7, #6
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldr.16 s4, [r6, #4]
-; CHECK-NEXT: vmov.f16 r0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r9, #4]
+; CHECK-NEXT: ldrh r0, [r6, #4]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldr.16 s4, [r6, #6]
-; CHECK-NEXT: vmov.f16 r0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: add.w r5, r9, #10
+; CHECK-NEXT: vldrw.u32 q1, [r3]
+; CHECK-NEXT: ldrh r0, [r6, #6]
+; CHECK-NEXT: add.w r3, r7, #10
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldr.16 s4, [r6, #8]
-; CHECK-NEXT: vmov.f16 r0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r9, #8]
+; CHECK-NEXT: ldrh r0, [r6, #8]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldr.16 s4, [r6, #10]
-; CHECK-NEXT: vmov.f16 r0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: add.w r5, r9, #14
+; CHECK-NEXT: vldrw.u32 q1, [r3]
+; CHECK-NEXT: ldrh r0, [r6, #10]
+; CHECK-NEXT: ldrh r3, [r6, #14]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldr.16 s4, [r6, #12]
-; CHECK-NEXT: vmov.f16 r0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r9, #12]
-; CHECK-NEXT: add.w r9, r9, #16
-; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: vldr.16 s4, [r6, #14]
+; CHECK-NEXT: ldrh r0, [r6, #12]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #12]
; CHECK-NEXT: adds r6, #16
-; CHECK-NEXT: vmov.f16 r0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: vfma.f16 q0, q1, r0
+; CHECK-NEXT: add.w r0, r7, #14
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: adds r7, #16
+; CHECK-NEXT: vfma.f16 q0, q1, r3
; CHECK-NEXT: le lr, .LBB16_6
-; CHECK-NEXT: @ %bb.7: @ %for.end
-; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT: cmp.w r10, #0
-; CHECK-NEXT: bne .LBB16_9
-; CHECK-NEXT: b .LBB16_3
-; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT: b .LBB16_8
+; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: cmp.w r10, #0
+; CHECK-NEXT: .LBB16_8: @ %for.end
+; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: beq.w .LBB16_3
-; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
+; CHECK-NEXT: @ %bb.9: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: mov r5, r9
+; CHECK-NEXT: mov r5, r7
; CHECK-NEXT: .LBB16_10: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vldr.16 s4, [r6]
-; CHECK-NEXT: subs r0, #1
-; CHECK-NEXT: adds r6, #2
-; CHECK-NEXT: cmp r0, #1
-; CHECK-NEXT: vmov.f16 r7, s4
+; CHECK-NEXT: ldrh r3, [r6], #2
; CHECK-NEXT: vldrh.u16 q1, [r5], #2
-; CHECK-NEXT: vfma.f16 q0, q1, r7
+; CHECK-NEXT: subs r0, #1
+; CHECK-NEXT: vfma.f16 q0, q1, r3
+; CHECK-NEXT: cmp r0, #1
; CHECK-NEXT: bgt .LBB16_10
; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT: add.w r9, r9, r10, lsl #1
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add.w r7, r7, r0, lsl #1
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_12: @ %if.end
-; CHECK-NEXT: add sp, #16
-; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: add sp, #28
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1