@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for
@ the ARMv7 platform. The description header can be found in
@ signal_processing_library.h
@
@ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and
@ the reference C code at the end of this file.

@ Assumptions:
@ (1) data_length > 0
@ (2) coefficients_length > 1

@ Register usage:
@
@ r0:  &data_in[i]
@ r1:  &data_out[i], for result output
@ r2:  &coefficients[0]
@ r3:  coefficients_length
@ r4:  Iteration counter for the outer loop.
@ r5:  data_out[j] as multiplication inputs
@ r6:  Calculated value for output data_out[]; iteration counter for inner loop
@ r7:  Partial sum of a filtering multiplication results
@ r8:  Partial sum of a filtering multiplication results
@ r9:  &data_out[], for filtering input; data_in[i]
@ r10: coefficients[j]
@ r11: Scratch
@ r12: &coefficients[j]

#include "webrtc/system_wrappers/interface/asm_defines.h"

@-----------------------------------------------------------------------------
@ void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
@                                int16_t* data_out,
@                                int16_t* __restrict coefficients,
@                                int coefficients_length,
@                                int data_length)
@ (signature as given by the reference C code at the end of this file)
@
@ In:    r0 = data_in, r1 = data_out, r2 = coefficients,
@        r3 = coefficients_length, fifth argument (data_length) on the stack.
@ Out:   data_out[0 .. data_length - 1] written; no return value is set.
@ Saved: r4-r11 are pushed on entry and popped before returning.
@ Uses:  r12 and the flags as scratch; r0 and r1 are advanced while iterating.
@
@ Two output samples are produced per outer-loop iteration (sum1/sum2), and a
@ trailing single sample is handled at ODD_LENGTH when data_length is odd.
@ The filter reads data_out[i - coefficients_length + 1] onward, i.e. memory
@ located before the data_out pointer that was passed in.
@
@ The packed multiplies (smlaXY/smulXY) take the element at the lower address
@ from the bottom halfword of a word load, and the word loads are issued at
@ halfword-granular addresses.
@ NOTE(review): this presumes little-endian data and unaligned word access
@ being permitted - confirm for the target configuration.
@-----------------------------------------------------------------------------
GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
.align 2
DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
  push {r4-r11}                 @ 8 registers = 32 bytes, so the first stack
                                @ argument now lives at [sp, #32].

  @ Load the whole word: the reference prototype declares data_length as int,
  @ and a halfword load would truncate lengths above 32767.
  ldr r12, [sp, #32]            @ data_length
  subs r4, r12, #1              @ Outer counter = data_length - 1
  beq ODD_LENGTH                @ jump if data_length == 1

LOOP_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                   @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                    @ &data_out[i - coefficients_length + 1]
  ldr r5, [r9], #4              @ data_out[i - coefficients_length + {1,2}]

  mov r7, #0                    @ sum1
  mov r8, #0                    @ sum2
  subs r6, r3, #3               @ Iteration counter for inner loop.
  beq ODD_A_LENGTH              @ branch if coefficients_length == 3
  blt POST_LOOP_A_LENGTH        @ branch if coefficients_length == 2

  @ Each pass consumes two coefficients (j and j - 1) for both outputs.
  @ The flags set by "subs r6, #2" survive the multiplies and the load and
  @ are consumed by the two branches at the bottom of the loop.
LOOP_A_LENGTH:
  ldr r10, [r12], #-4           @ coefficients[j - 1], coefficients[j]
  subs r6, #2
  smlatt r8, r10, r5, r8        @ sum2 += coefficients[j] * data_out[i - j + 1];
  smlatb r7, r10, r5, r7        @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r7, r10, r5, r7        @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  ldr r5, [r9], #4              @ data_out[i - j + 2], data_out[i - j + 3]
  smlabb r8, r10, r5, r8        @ sum2 += coefficients[j - 1] * data_out[i - j + 2];
  bgt LOOP_A_LENGTH
  blt POST_LOOP_A_LENGTH        @ Even coefficients_length: no coefficients[2] tap left.

  @ Odd coefficients_length: coefficients[2] is the unpaired tap.
ODD_A_LENGTH:
  ldrsh r10, [r12, #2]          @ Filter coefficients coefficients[2]
  sub r12, #2                   @ &coefficients[0]
  smlabb r7, r10, r5, r7        @ sum1 += coefficients[2] * data_out[i - 2];
  smlabt r8, r10, r5, r8        @ sum2 += coefficients[2] * data_out[i - 1];
  ldr r5, [r9, #-2]             @ data_out[i - 1], data_out[i]

POST_LOOP_A_LENGTH:
  ldr r10, [r12]                @ coefficients[0], coefficients[1]
  smlatb r7, r10, r5, r7        @ sum1 += coefficients[1] * data_out[i - 1];

  ldr r9, [r0], #4              @ data_in[i], data_in[i + 1]
  smulbb r6, r10, r9            @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                    @ output1 -= sum1;

  @ Round and saturate to int16. sbfx takes bits [27:12] sign-extended, ssat
  @ clamps (output1 >> 12) to 16 bits; the two agree only when the shifted
  @ value was not clamped, and only then is the rounding term 2048 added.
  sbfx r11, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r11
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2             @ Store data_out[i]

  smlatb r8, r10, r6, r8        @ sum2 += coefficients[1] * data_out[i];
  smulbt r6, r10, r9            @ output2 = coefficients[0] * data_in[i + 1];
  sub r6, r8                    @ output2 -= sum2;

  @ Same round-and-saturate sequence as above, now for output2.
  sbfx r11, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r11
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2             @ Store data_out[i + 1]

  subs r4, #2
  bgt LOOP_LENGTH
  blt END                       @ For even data_length, it's done. Jump to END.

@ Process i = data_length - 1, for the case of an odd length.
ODD_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                   @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                    @ &data_out[i - coefficients_length + 1]
  mov r7, #0                    @ Partial sum of the coefficients[j] terms
  mov r8, #0                    @ Partial sum of the coefficients[j - 1] terms
  subs r6, r3, #2               @ inner loop counter
  beq EVEN_A_LENGTH             @ branch if coefficients_length == 2

LOOP2_A_LENGTH:
  ldr r10, [r12], #-4           @ coefficients[j - 1], coefficients[j]
  ldr r5, [r9], #4              @ data_out[i - j], data_out[i - j + 1]
  subs r6, #2
  smlatb r7, r10, r5, r7        @ r7 += coefficients[j] * data_out[i - j];
  smlabt r8, r10, r5, r8        @ r8 += coefficients[j - 1] * data_out[i - j + 1];
  bgt LOOP2_A_LENGTH
  addlt r12, #2                 @ Odd coefficients_length: step back to &coefficients[0]
  blt POST_LOOP2_A_LENGTH

  @ Even coefficients_length: coefficients[1] is the unpaired tap.
EVEN_A_LENGTH:
  ldrsh r10, [r12, #2]          @ Filter coefficients coefficients[1]
  ldrsh r5, [r9]                @ data_out[i - 1]
  smlabb r7, r10, r5, r7        @ r7 += coefficients[1] * data_out[i - 1];

POST_LOOP2_A_LENGTH:
  ldrsh r10, [r12]              @ Filter coefficients coefficients[0]
  ldrsh r9, [r0]                @ data_in[i]
  smulbb r6, r10, r9            @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                    @ output1 -= partial sum in r7;
  sub r6, r8                    @ output1 -= partial sum in r8;
  @ Round and saturate to int16 (same sequence as in the main loop, with r8
  @ reused as the scratch register).
  sbfx r8, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r8
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1]                 @ Store the data_out[i]

END:
  pop {r4-r11}
  bx  lr

@Reference C code:
@
@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
@                               int16_t* data_out,
@                               int16_t* __restrict coefficients,
@                               int coefficients_length,
@                               int data_length) {
@  int i = 0;
@  int j = 0;
@
@  for (i = 0; i < data_length - 1; i += 2) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@    int32_t output2 = 0;
@    int32_t sum2 = 0;
@
@    for (j = coefficients_length - 1; j > 2; j -= 2) {
@      sum1 += coefficients[j] * data_out[i - j];
@      sum1 += coefficients[j - 1] * data_out[i - j + 1];
@      sum2 += coefficients[j] * data_out[i - j + 1];
@      sum2 += coefficients[j - 1] * data_out[i - j + 2];
@    }
@
@    if (j == 2) {
@      sum1 += coefficients[2] * data_out[i - 2];
@      sum2 += coefficients[2] * data_out[i - 1];
@    }
@
@    sum1 += coefficients[1] * data_out[i - 1];
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@
@    sum2 += coefficients[1] * data_out[i];
@    output2 = coefficients[0] * data_in[i + 1];
@    output2 -= sum2;
@    // Saturate and store the output.
@    output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728);
@    data_out[i + 1] = (int16_t)((output2 + 2048) >> 12);
@  }
@
@  if (i == data_length - 1) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@
@    for (j = coefficients_length - 1; j > 1; j -= 2) {
@      sum1 += coefficients[j] * data_out[i - j];
@      sum1 += coefficients[j - 1] * data_out[i - j + 1];
@    }
@
@    if (j == 1) {
@      sum1 += coefficients[1] * data_out[i - 1];
@    }
@
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@  }
@}