blob: 4e348ec646d4297ff2ee5217d380fc1a91fd6e55 [file] [log] [blame]
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +00001@
2@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3@
4@ Use of this source code is governed by a BSD-style license
5@ that can be found in the LICENSE file in the root of the source
6@ tree. An additional intellectual property rights grant can be found
7@ in the file PATENTS. All contributing project authors may
8@ be found in the AUTHORS file in the root of the source tree.
9@
10
11@ This file contains the function WebRtcSpl_DownsampleFastNeon(), optimized for
12@ ARM Neon platform. The description header can be found in
13@ signal_processing_library.h
14@
15@ The reference C code is in file downsample_fast.c. Bit-exact.
16
kma@webrtc.org9fc62502012-11-17 00:22:46 +000017#include "webrtc/system_wrappers/interface/asm_defines.h"
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +000018
GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
.align 2
@-----------------------------------------------------------------------------
@ WebRtcSpl_DownsampleFastNeon
@
@ FIR-filters data_in[] with coefficients[] and emits one output sample for
@ every factor input samples. In pseudo-C:
@
@   endpos = delay + factor * (data_out_length - 1) + 1;
@   for (i = delay; i < endpos; i += factor) {
@     acc = 2048;
@     for (j = 0; j < coefficients_length; j++)
@       acc += coefficients[j] * data_in[i - j];
@     *data_out++ = saturate_to_int16(acc >> 12);
@   }
@
@ Arguments on entry (stack offsets are +32 after the push below):
@   r0        data_in
@   r1        data_in_length
@   r2        data_out
@   r3        data_out_length
@   [sp, 0]   coefficients
@   [sp, 4]   coefficients_length
@   [sp, 8]   factor
@   [sp, 12]  delay
@
@ Returns (r0):
@   0   on success
@  -1   if data_out_length <= 0, coefficients_length <= 0 or
@       data_in_length < endpos
@
@ Register roles once the argument checks are done:
@   r0   coefficients_length          r8   &coefficients[coefficients_length]
@   r2   output write pointer         r9   i (index of the current input sample)
@   r3   loop limit (see each loop)   r11  &data_in[i]
@   r5   factor (2 * factor inside the general unrolled loop)
@   r1, r4, r6, r7, r10, r12 are scratch. NEON scratch: q0-q3, q8-q10.
@   r4-r11 are saved and restored by this routine.
@-----------------------------------------------------------------------------
DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
  push {r4-r11}                             @ 8 registers = 32 bytes.

  cmp r3, #0                                @ data_out_length <= 0?
  movle r0, #-1
  ble END

  @ delay is read as a full word, exactly like factor and coefficients_length,
  @ so it is neither truncated to 16 bits nor dependent on byte order.
  ldr r12, [sp, #44]                        @ r12: delay
  ldr r5, [sp, #40]                         @ r5: factor
  add r4, r12, #1                           @ r4: delay + 1
  sub r3, r3, #1                            @ r3: data_out_length - 1
  @ Full 32-bit multiply. A halfword multiply (smulbb) would only look at the
  @ low 16 bits of each operand and yield a wrong endpos as soon as factor or
  @ data_out_length - 1 leaves the int16 range.
  mul r3, r5, r3                            @ factor * (data_out_length - 1)
  ldr r8, [sp, #32]                         @ &coefficients[0]
  mov r9, r12                               @ Iteration counter i = delay.
  add r3, r4                                @ r3: endpos

  cmp r3, r1                                @ data_in_length < endpos?
  movgt r0, #-1
  bgt END

  @ Initializations.
  sub r3, r5, asl #3                        @ endpos - factor * 8
  add r11, r0, r12, asl #1                  @ &data_in[delay]
  ldr r0, [sp, #36]                         @ coefficients_length
  add r3, r5                                @ endpos - factor * 7

  cmp r0, #0                                @ coefficients_length <= 0 ?
  movle r0, #-1
  ble END

  add r8, r0, asl #1                        @ &coefficients[coefficients_length]
  cmp r9, r3
  bge POST_LOOP_ENDPOS                      @ Fewer than 8 outputs to produce:
                                            @ skip the unrolled part.

@
@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others)
@
  mov r4, #-2                               @ Coefficient pointer stride: walk
                                            @ coefficients[] backwards.

  @ Direct program flow to the right channel.

  @ r10 is the post-index applied to the data pointer by the last load of an
  @ inner-loop iteration. For factor == 2 the first vld2 advances the pointer
  @ by 16 bytes (writeback), so -14 leaves it 2 bytes (one sample) past where
  @ the iteration started.
  cmp r5, #2
  moveq r10, #-14
  beq LOOP_ENDPOS_FACTOR2                   @ Branch when factor == 2

  @ Same idea for factor == 4: the first vld4 advances by 32 bytes, so -30
  @ gives a net step of one sample.
  cmp r5, #4
  moveq r10, #-30
  beq LOOP_ENDPOS_FACTOR4                   @ Branch when factor == 4

  @ General case: seven lane loads advance by (factor * 2) bytes each, so the
  @ eighth must step by 2 - factor * 14 for a net step of one sample.
  mov r10, r5, asl #4                       @ factor * 16
  rsb r10, #2                               @ 2 - factor * 16
  add r10, r5, asl #1                       @ 2 - factor * 14
  lsl r5, #1                                @ r5 = factor * sizeof(data_in[0])

@ The general case (factor != 2 && factor != 4)
LOOP_ENDPOS_GENERAL:
  @ Initializations.
  vmov.i32 q2, #2048                        @ Accumulators for outputs 0-3 and
  vmov.i32 q3, #2048                        @ 4-7, preloaded with the rounding
                                            @ offset (half of 1 << 12).
  sub r7, r8, #2                            @ &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r1, r11, r12, asl #1                  @ &data_in[i - j], j at its maximum

LOOP_COEFF_LENGTH_GENERAL:
  vld1.16 {d2[], d3[]}, [r7], r4            @ coefficients[j] in every lane
  vld1.16 d0[0], [r1], r5                   @ data_in[i - j]
  vld1.16 d0[1], [r1], r5                   @ data_in[i + factor - j]
  vld1.16 d0[2], [r1], r5                   @ data_in[i + factor * 2 - j]
  vld1.16 d0[3], [r1], r5                   @ data_in[i + factor * 3 - j]
  vld1.16 d1[0], [r1], r5                   @ data_in[i + factor * 4 - j]
  vld1.16 d1[1], [r1], r5                   @ data_in[i + factor * 5 - j]
  vld1.16 d1[2], [r1], r5                   @ data_in[i + factor * 6 - j]
  vld1.16 d1[3], [r1], r10                  @ data_in[i + factor * 7 - j]
  subs r12, #1                              @ The NEON ops below leave the
  vmlal.s16 q2, d0, d2                      @ flags from subs untouched.
  vmlal.s16 q3, d1, d3
  bge LOOP_COEFF_LENGTH_GENERAL

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r5, asl #3                       @ r11 -> &data_in[i + factor * 8]
                                            @ (r5 holds factor * 2 here).
  add r9, r5, asl #2                        @ Counter i += factor * 8.
  cmp r9, r3                                @ i < endpos - factor * 7 ?
  blt LOOP_ENDPOS_GENERAL
  asr r5, #1                                @ Restore r5 to the value of factor.
  b POST_LOOP_ENDPOS

@ The case for factor == 2.
LOOP_ENDPOS_FACTOR2:
  @ Initializations.
  vmov.i32 q2, #2048                        @ Rounding offset, as above.
  vmov.i32 q3, #2048
  sub r7, r8, #2                            @ &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r1, r11, r12, asl #1                  @ &data_in[i - j], j at its maximum

  @ Each vld2 de-interleaves 8 halfwords, so d0 and d2 receive the even
  @ samples, which are the ones spaced factor == 2 apart.
  @ NOTE(review): the two loads fetch 16 consecutive halfwords while the last
  @ one used is the 15th, so in the final unrolled block (j == 0) the fetch
  @ can extend to data_in[endpos]; confirm callers always have that sample
  @ readable.
LOOP_COEFF_LENGTH_FACTOR2:
  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j] in every lane
  vld2.16 {d0, d1}, [r1]!                   @ data_in[], outputs 0-3 in d0
  vld2.16 {d2, d3}, [r1], r10               @ data_in[], outputs 4-7 in d2
  subs r12, #1
  vmlal.s16 q2, d0, d16
  vmlal.s16 q3, d2, d17
  bge LOOP_COEFF_LENGTH_FACTOR2

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #3                        @ Counter i += factor * 8.
  cmp r9, r3                                @ i < endpos - factor * 7 ?
  blt LOOP_ENDPOS_FACTOR2
  b POST_LOOP_ENDPOS

@ The case for factor == 4.
LOOP_ENDPOS_FACTOR4:
  @ Initializations.
  vmov.i32 q2, #2048                        @ Rounding offset, as above.
  vmov.i32 q3, #2048
  sub r7, r8, #2                            @ &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r1, r11, r12, asl #1                  @ &data_in[i - j], j at its maximum

  @ Each vld4 de-interleaves 16 halfwords, so d0 and d18 receive every fourth
  @ sample.
  @ NOTE(review): the two loads fetch 32 consecutive halfwords while the last
  @ one used is the 29th, so in the final unrolled block (j == 0) the fetch
  @ can extend to data_in[endpos + 2]; confirm callers always have those
  @ samples readable.
LOOP_COEFF_LENGTH_FACTOR4:
  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j] in every lane
  vld4.16 {d0, d1, d2, d3}, [r1]!           @ data_in[], outputs 0-3 in d0
  vld4.16 {d18, d19, d20, d21}, [r1], r10   @ data_in[], outputs 4-7 in d18
  subs r12, #1
  vmlal.s16 q2, d0, d16
  vmlal.s16 q3, d18, d17
  bge LOOP_COEFF_LENGTH_FACTOR4

  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #3                        @ Counter i += factor * 8.

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  cmp r9, r3                                @ i < endpos - factor * 7 ?
  vst1.16 {d0, d1}, [r2]!                   @ Does not alter the flags.

  blt LOOP_ENDPOS_FACTOR4
  @ Falls through into the second part.

@
@ Second part, do the rest iterations (if any).
@

POST_LOOP_ENDPOS:
  add r3, r5, asl #3
  sub r3, r5                                @ Restore r3 to endpos.
  cmp r9, r3
  movge r0, #0                              @ Nothing left: report success.
  bge END

  @ One output sample per pass, using core registers only.
LOOP2_ENDPOS:
  @ Initializations.
  mov r7, r8                                @ &coefficients[coefficients_length]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r6, r11, r12, asl #1                  @ &data_in[i - j], j at its maximum

  mov r1, #2048                             @ Accumulator with rounding offset.

LOOP2_COEFF_LENGTH:
  ldrsh r4, [r7, #-2]!                      @ coefficients[j], j counting down
  ldrsh r10, [r6], #2                       @ data_in[i - j]
  smlabb r1, r4, r10, r1
  subs r12, #1
  bge LOOP2_COEFF_LENGTH

  @ Shift, saturate, and store the result.
  ssat r1, #16, r1, asr #12
  strh r1, [r2], #2

  add r11, r5, asl #1                       @ r11 -> &data_in[i + factor]
  add r9, r5                                @ Counter i += factor.
  cmp r9, r3                                @ i < endpos?
  blt LOOP2_ENDPOS

  mov r0, #0                                @ Success.

END:
  pop {r4-r11}
  bx lr