andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 1 | @ |
| 2 | @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
| 3 | @ |
| 4 | @ Use of this source code is governed by a BSD-style license |
| 5 | @ that can be found in the LICENSE file in the root of the source |
| 6 | @ tree. An additional intellectual property rights grant can be found |
| 7 | @ in the file PATENTS. All contributing project authors may |
| 8 | @ be found in the AUTHORS file in the root of the source tree. |
| 9 | @ |
| 10 | |
| 11 | @ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for |
| 12 | @ ARMv7 platform. The description header can be found in |
| 13 | @ signal_processing_library.h |
| 14 | @ |
| 15 | @ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and |
| 16 | @ the reference C code at end of this file. |
| 17 | |
| 18 | @ Assumptions: |
| 19 | @ (1) data_length > 0 |
| 20 | @ (2) coefficients_length > 1 |
| 21 | |
| 22 | @ Register usage: |
| 23 | @ |
| 24 | @ r0: &data_in[i] |
| 25 | @ r1: &data_out[i], for result output |
| 26 | @ r2: &coefficients[0] |
| 27 | @ r3: coefficients_length |
| 28 | @ r4: Iteration counter for the outer loop. |
| 29 | @ r5: data_out[j] as multiplication inputs |
| 30 | @ r6: Calculated value for output data_out[]; iteration counter for inner loop |
| 31 | @ r7: Partial sum of a filtering multiplication results |
| 32 | @ r8: Partial sum of a filtering multiplication results |
| 33 | @ r9: &data_out[], for filtering input; data_in[i] |
| 34 | @ r10: coefficients[j] |
| 35 | @ r11: Scratch |
| 36 | @ r12: &coefficients[j] |
| 37 | |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 38 | #include "webrtc/system_wrappers/interface/asm_defines.h" |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 39 | |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 40 | GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12 |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 41 | .align 2 |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 42 | DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12 |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 43 | push {r4-r11} @ Save r4-r11 (8 words, so stack arguments move up by 32 bytes). |
| 44 | @ data_length is an int (see reference C code), so load the whole word, not a halfword. |
| 45 | ldr r12, [sp, #32] @ data_length |
| 46 | subs r4, r12, #1 @ Outer loop counter = data_length - 1 |
| 47 | beq ODD_LENGTH @ jump if data_length == 1 |
| 48 | @ Main loop: each pass produces data_out[i] (sum1) and data_out[i + 1] (sum2). |
| 49 | LOOP_LENGTH: |
| 50 | add r12, r2, r3, lsl #1 |
| 51 | sub r12, #4 @ &coefficients[coefficients_length - 2] |
| 52 | sub r9, r1, r3, lsl #1 |
| 53 | add r9, #2 @ &data_out[i - coefficients_length + 1] |
| 54 | ldr r5, [r9], #4 @ data_out[i - coefficients_length + {1,2}] |
| 55 | |
| 56 | mov r7, #0 @ sum1 |
| 57 | mov r8, #0 @ sum2 |
| 58 | subs r6, r3, #3 @ Iteration counter for inner loop. |
| 59 | beq ODD_A_LENGTH @ branch if coefficients_length == 3 |
| 60 | blt POST_LOOP_A_LENGTH @ branch if coefficients_length == 2 |
| 61 | @ Inner loop: two coefficients per pass, r10 = {bottom: coefficients[j - 1], top: coefficients[j]}. |
| 62 | LOOP_A_LENGTH: |
| 63 | ldr r10, [r12], #-4 @ coefficients[j - 1], coefficients[j] |
| 64 | subs r6, #2 @ Flags set here are consumed by the bgt/blt after the multiplies. |
| 65 | smlatt r8, r10, r5, r8 @ sum2 += coefficients[j] * data_out[i - j + 1]; |
| 66 | smlatb r7, r10, r5, r7 @ sum1 += coefficients[j] * data_out[i - j]; |
| 67 | smlabt r7, r10, r5, r7 @ sum1 += coefficients[j - 1] * data_out[i - j + 1]; |
| 68 | ldr r5, [r9], #4 @ data_out[i - j + 2], data_out[i - j + 3] |
| 69 | smlabb r8, r10, r5, r8 @ sum2 += coefficients[j - 1] * data_out[i - j + 2]; |
| 70 | bgt LOOP_A_LENGTH @ More coefficient pairs remain. |
| 71 | blt POST_LOOP_A_LENGTH @ Even coefficients_length: only coefficients[1] and [0] remain. |
| 72 | @ Counter reached zero (odd coefficients_length): coefficients[2] is still pending. |
| 73 | ODD_A_LENGTH: |
| 74 | ldrsh r10, [r12, #2] @ Filter coefficients coefficients[2] |
| 75 | sub r12, #2 @ &coefficients[0] |
| 76 | smlabb r7, r10, r5, r7 @ sum1 += coefficients[2] * data_out[i - 2]; |
| 77 | smlabt r8, r10, r5, r8 @ sum2 += coefficients[2] * data_out[i - 1]; |
| 78 | ldr r5, [r9, #-2] @ data_out[i - 1], data_out[i] (only the bottom half is used below) |
| 79 | |
| 80 | POST_LOOP_A_LENGTH: |
| 81 | ldr r10, [r12] @ coefficients[0], coefficients[1] |
| 82 | smlatb r7, r10, r5, r7 @ sum1 += coefficients[1] * data_out[i - 1]; |
| 83 | |
| 84 | ldr r9, [r0], #4 @ data_in[i], data_in[i + 1] |
| 85 | smulbb r6, r10, r9 @ output1 = coefficients[0] * data_in[i]; |
| 86 | sub r6, r7 @ output1 -= sum1; |
| 87 | @ Round and saturate output1 to 16 bits (Q12 -> Q0). |
| 88 | sbfx r11, r6, #12, #16 @ r11 = bits [27:12] of output1, sign-extended |
| 89 | ssat r7, #16, r6, asr #12 @ r7 = (output1 >> 12) saturated to 16 bits |
| 90 | cmp r7, r11 |
| 91 | addeq r6, r6, #2048 @ Add the rounding constant only when the two values agree. |
| 92 | ssat r6, #16, r6, asr #12 @ Final 16-bit saturated result |
| 93 | strh r6, [r1], #2 @ Store data_out[i] |
| 94 | |
| 95 | smlatb r8, r10, r6, r8 @ sum2 += coefficients[1] * data_out[i]; |
| 96 | smulbt r6, r10, r9 @ output2 = coefficients[0] * data_in[i + 1]; |
| 97 | sub r6, r8 @ output2 -= sum2; |
| 98 | @ Round and saturate output2 to 16 bits, same sequence as for output1. |
| 99 | sbfx r11, r6, #12, #16 @ r11 = bits [27:12] of output2, sign-extended |
| 100 | ssat r7, #16, r6, asr #12 @ r7 = (output2 >> 12) saturated to 16 bits |
| 101 | cmp r7, r11 |
| 102 | addeq r6, r6, #2048 @ Add the rounding constant only when the two values agree. |
| 103 | ssat r6, #16, r6, asr #12 @ Final 16-bit saturated result |
| 104 | strh r6, [r1], #2 @ Store data_out[i + 1] |
| 105 | |
| 106 | subs r4, #2 @ Two output samples consumed per pass. |
| 107 | bgt LOOP_LENGTH |
| 108 | blt END @ For even data_length, it is done. Jump to END. |
| 109 | |
| 110 | @ Process i = data_length - 1, for the case of an odd length. |
| 111 | ODD_LENGTH: |
| 112 | add r12, r2, r3, lsl #1 |
| 113 | sub r12, #4 @ &coefficients[coefficients_length - 2] |
| 114 | sub r9, r1, r3, lsl #1 |
| 115 | add r9, #2 @ &data_out[i - coefficients_length + 1] |
| 116 | mov r7, #0 @ sum1, first partial accumulator |
| 117 | mov r8, #0 @ sum1, second partial accumulator (both are subtracted from output1) |
| 118 | subs r6, r3, #2 @ inner loop counter |
| 119 | beq EVEN_A_LENGTH @ branch if coefficients_length == 2 |
| 120 | |
| 121 | LOOP2_A_LENGTH: |
| 122 | ldr r10, [r12], #-4 @ coefficients[j - 1], coefficients[j] |
| 123 | ldr r5, [r9], #4 @ data_out[i - j], data_out[i - j + 1] |
| 124 | subs r6, #2 @ Flags set here are consumed by the bgt/addlt/blt below. |
| 125 | smlatb r7, r10, r5, r7 @ sum1 += coefficients[j] * data_out[i - j]; |
| 126 | smlabt r8, r10, r5, r8 @ sum1 += coefficients[j - 1] * data_out[i - j + 1]; (kept in r8) |
| 127 | bgt LOOP2_A_LENGTH |
| 128 | addlt r12, #2 @ Odd coefficients_length: r12 ran one element past, back to &coefficients[0] |
| 129 | blt POST_LOOP2_A_LENGTH @ Odd coefficients_length: coefficients[1] was already handled in the loop. |
| 130 | @ Counter reached zero (even coefficients_length): coefficients[1] is still pending. |
| 131 | EVEN_A_LENGTH: |
| 132 | ldrsh r10, [r12, #2] @ Filter coefficients coefficients[1] |
| 133 | ldrsh r5, [r9] @ data_out[i - 1] |
| 134 | smlabb r7, r10, r5, r7 @ sum1 += coefficients[1] * data_out[i - 1]; |
| 135 | |
| 136 | POST_LOOP2_A_LENGTH: |
| 137 | ldrsh r10, [r12] @ Filter coefficients coefficients[0] |
| 138 | ldrsh r9, [r0] @ data_in[i] |
| 139 | smulbb r6, r10, r9 @ output1 = coefficients[0] * data_in[i]; |
| 140 | sub r6, r7 @ output1 -= sum1 (first partial accumulator); |
| 141 | sub r6, r8 @ output1 -= sum1 (second partial accumulator); |
| 142 | sbfx r8, r6, #12, #16 @ r8 = bits [27:12] of output1, sign-extended (r8 reused as scratch) |
| 143 | ssat r7, #16, r6, asr #12 @ r7 = (output1 >> 12) saturated to 16 bits |
| 144 | cmp r7, r8 |
| 145 | addeq r6, r6, #2048 @ Add the rounding constant only when the two values agree. |
| 146 | ssat r6, #16, r6, asr #12 @ Final 16-bit saturated result |
| 147 | strh r6, [r1] @ Store the data_out[i] |
| 148 | |
| 149 | END: |
| 150 | pop {r4-r11} @ Restore the registers saved on entry. |
| 151 | bx lr |
| 152 | |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 153 | @Reference C code: |
| 154 | @ |
| 155 | @void WebRtcSpl_FilterARFastQ12(int16_t* data_in, |
| 156 | @ int16_t* data_out, |
| 157 | @ int16_t* __restrict coefficients, |
| 158 | @ int coefficients_length, |
| 159 | @ int data_length) { |
| 160 | @ int i = 0; |
| 161 | @ int j = 0; |
| 162 | @ |
| 163 | @ for (i = 0; i < data_length - 1; i += 2) { |
| 164 | @ int32_t output1 = 0; |
| 165 | @ int32_t sum1 = 0; |
| 166 | @ int32_t output2 = 0; |
| 167 | @ int32_t sum2 = 0; |
| 168 | @ |
| 169 | @ for (j = coefficients_length - 1; j > 2; j -= 2) { |
| 170 | @ sum1 += coefficients[j] * data_out[i - j]; |
| 171 | @ sum1 += coefficients[j - 1] * data_out[i - j + 1]; |
| 172 | @ sum2 += coefficients[j] * data_out[i - j + 1]; |
| 173 | @ sum2 += coefficients[j - 1] * data_out[i - j + 2]; |
| 174 | @ } |
| 175 | @ |
| 176 | @ if (j == 2) { |
| 177 | @ sum1 += coefficients[2] * data_out[i - 2]; |
| 178 | @ sum2 += coefficients[2] * data_out[i - 1]; |
| 179 | @ } |
| 180 | @ |
| 181 | @ sum1 += coefficients[1] * data_out[i - 1]; |
| 182 | @ output1 = coefficients[0] * data_in[i]; |
| 183 | @ output1 -= sum1; |
| 184 | @ // Saturate and store the output. |
| 185 | @ output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728); |
| 186 | @ data_out[i] = (int16_t)((output1 + 2048) >> 12); |
| 187 | @ |
| 188 | @ sum2 += coefficients[1] * data_out[i]; |
| 189 | @ output2 = coefficients[0] * data_in[i + 1]; |
| 190 | @ output2 -= sum2; |
| 191 | @ // Saturate and store the output. |
| 192 | @ output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728); |
| 193 | @ data_out[i + 1] = (int16_t)((output2 + 2048) >> 12); |
| 194 | @ } |
| 195 | @ |
| 196 | @ if (i == data_length - 1) { |
| 197 | @ int32_t output1 = 0; |
| 198 | @ int32_t sum1 = 0; |
| 199 | @ |
| 200 | @ for (j = coefficients_length - 1; j > 1; j -= 2) { |
| 201 | @ sum1 += coefficients[j] * data_out[i - j]; |
| 202 | @ sum1 += coefficients[j - 1] * data_out[i - j + 1]; |
| 203 | @ } |
| 204 | @ |
| 205 | @ if (j == 1) { |
| 206 | @ sum1 += coefficients[1] * data_out[i - 1]; |
| 207 | @ } |
| 208 | @ |
| 209 | @ output1 = coefficients[0] * data_in[i]; |
| 210 | @ output1 -= sum1; |
| 211 | @ // Saturate and store the output. |
| 212 | @ output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728); |
| 213 | @ data_out[i] = (int16_t)((output1 + 2048) >> 12); |
| 214 | @ } |
| 215 | @} |