David Rowe | 10602db | 2008-10-06 21:41:46 -0700 | [diff] [blame] | 1 | /* |
| 2 | * SpanDSP - a series of DSP components for telephony |
| 3 | * |
| 4 | * fir.h - General telephony FIR routines |
| 5 | * |
| 6 | * Written by Steve Underwood <steveu@coppice.org> |
| 7 | * |
| 8 | * Copyright (C) 2002 Steve Underwood |
| 9 | * |
| 10 | * All rights reserved. |
| 11 | * |
| 12 | * This program is free software; you can redistribute it and/or modify |
| 13 | * it under the terms of the GNU General Public License version 2, as |
| 14 | * published by the Free Software Foundation. |
| 15 | * |
| 16 | * This program is distributed in the hope that it will be useful, |
| 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 19 | * GNU General Public License for more details. |
| 20 | * |
| 21 | * You should have received a copy of the GNU General Public License |
| 22 | * along with this program; if not, write to the Free Software |
| 23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 24 | * |
| 25 | * $Id: fir.h,v 1.8 2006/10/24 13:45:28 steveu Exp $ |
| 26 | */ |
| 27 | |
| 28 | /*! \page fir_page FIR filtering |
| 29 | \section fir_page_sec_1 What does it do? |
| 30 | ???. |
| 31 | |
| 32 | \section fir_page_sec_2 How does it work? |
| 33 | ???. |
| 34 | */ |
| 35 | |
| 36 | #if !defined(_FIR_H_) |
| 37 | #define _FIR_H_ |
| 38 | |
| 39 | /* |
| 40 | Blackfin NOTES & IDEAS: |
| 41 | |
| 42 | A simple dot product function is used to implement the filter. This performs |
| 43 | just one MAC/cycle which is inefficient but was easy to implement as a first |
| 44 | pass. The current Blackfin code also uses an unrolled form of the filter |
| 45 | history to avoid 0 length hardware loop issues. This is wasteful of |
| 46 | memory. |
| 47 | |
| 48 | Ideas for improvement: |
| 49 | |
| 50 | 1/ Rewrite filter for dual MAC inner loop. The issue here is handling |
| 51 | history sample offsets that are 16 bit aligned - the dual MAC needs |
| 52 | 32 bit aligmnent. There are some good examples in libbfdsp. |
| 53 | |
| 54 | 2/ Use the hardware circular buffer facility tohalve memory usage. |
| 55 | |
| 56 | 3/ Consider using internal memory. |
| 57 | |
| 58 | Using less memory might also improve speed as cache misses will be |
| 59 | reduced. A drop in MIPs and memory approaching 50% should be |
| 60 | possible. |
| 61 | |
| 62 | The foreground and background filters currenlty use a total of |
| 63 | about 10 MIPs/ch as measured with speedtest.c on a 256 TAP echo |
| 64 | can. |
| 65 | */ |
| 66 | |
| 67 | #if defined(USE_MMX) || defined(USE_SSE2) |
| 68 | #include "mmx.h" |
| 69 | #endif |
| 70 | |
| 71 | /*! |
| 72 | 16 bit integer FIR descriptor. This defines the working state for a single |
| 73 | instance of an FIR filter using 16 bit integer coefficients. |
| 74 | */ |
| 75 | typedef struct |
| 76 | { |
| 77 | int taps; |
| 78 | int curr_pos; |
| 79 | const int16_t *coeffs; |
| 80 | int16_t *history; |
| 81 | } fir16_state_t; |
| 82 | |
| 83 | /*! |
| 84 | 32 bit integer FIR descriptor. This defines the working state for a single |
| 85 | instance of an FIR filter using 32 bit integer coefficients, and filtering |
| 86 | 16 bit integer data. |
| 87 | */ |
| 88 | typedef struct |
| 89 | { |
| 90 | int taps; |
| 91 | int curr_pos; |
| 92 | const int32_t *coeffs; |
| 93 | int16_t *history; |
| 94 | } fir32_state_t; |
| 95 | |
| 96 | /*! |
| 97 | Floating point FIR descriptor. This defines the working state for a single |
| 98 | instance of an FIR filter using floating point coefficients and data. |
| 99 | */ |
| 100 | typedef struct |
| 101 | { |
| 102 | int taps; |
| 103 | int curr_pos; |
| 104 | const float *coeffs; |
| 105 | float *history; |
| 106 | } fir_float_state_t; |
| 107 | |
| 108 | #ifdef __cplusplus |
| 109 | extern "C" { |
| 110 | #endif |
| 111 | |
| 112 | static __inline__ const int16_t *fir16_create(fir16_state_t *fir, |
| 113 | const int16_t *coeffs, |
| 114 | int taps) |
| 115 | { |
| 116 | fir->taps = taps; |
| 117 | fir->curr_pos = taps - 1; |
| 118 | fir->coeffs = coeffs; |
Tzafrir Cohen | f55ccbf | 2008-10-12 08:13:21 +0200 | [diff] [blame^] | 119 | #if defined(USE_MMX) || defined(USE_SSE2) || defined(__bfin__) |
David Rowe | 10602db | 2008-10-06 21:41:46 -0700 | [diff] [blame] | 120 | if ((fir->history = malloc(2*taps*sizeof(int16_t)))) |
| 121 | memset(fir->history, 0, 2*taps*sizeof(int16_t)); |
| 122 | #else |
| 123 | if ((fir->history = (int16_t *) malloc(taps*sizeof(int16_t)))) |
| 124 | memset(fir->history, 0, taps*sizeof(int16_t)); |
| 125 | #endif |
| 126 | return fir->history; |
| 127 | } |
| 128 | /*- End of function --------------------------------------------------------*/ |
| 129 | |
| 130 | static __inline__ void fir16_flush(fir16_state_t *fir) |
| 131 | { |
Tzafrir Cohen | f55ccbf | 2008-10-12 08:13:21 +0200 | [diff] [blame^] | 132 | #if defined(USE_MMX) || defined(USE_SSE2) || defined(__bfin__) |
David Rowe | 10602db | 2008-10-06 21:41:46 -0700 | [diff] [blame] | 133 | memset(fir->history, 0, 2*fir->taps*sizeof(int16_t)); |
| 134 | #else |
| 135 | memset(fir->history, 0, fir->taps*sizeof(int16_t)); |
| 136 | #endif |
| 137 | } |
| 138 | /*- End of function --------------------------------------------------------*/ |
| 139 | |
| 140 | static __inline__ void fir16_free(fir16_state_t *fir) |
| 141 | { |
| 142 | free(fir->history); |
| 143 | } |
| 144 | /*- End of function --------------------------------------------------------*/ |
| 145 | |
Tzafrir Cohen | f55ccbf | 2008-10-12 08:13:21 +0200 | [diff] [blame^] | 146 | #ifdef __bfin__ |
David Rowe | 10602db | 2008-10-06 21:41:46 -0700 | [diff] [blame] | 147 | static inline int32_t dot_asm(short *x, short *y, int len) |
| 148 | { |
| 149 | int dot; |
| 150 | |
| 151 | len--; |
| 152 | |
| 153 | __asm__ |
| 154 | ( |
| 155 | "I0 = %1;\n\t" |
| 156 | "I1 = %2;\n\t" |
| 157 | "A0 = 0;\n\t" |
| 158 | "R0.L = W[I0++] || R1.L = W[I1++];\n\t" |
| 159 | "LOOP dot%= LC0 = %3;\n\t" |
| 160 | "LOOP_BEGIN dot%=;\n\t" |
| 161 | "A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t" |
| 162 | "LOOP_END dot%=;\n\t" |
| 163 | "A0 += R0.L*R1.L (IS);\n\t" |
| 164 | "R0 = A0;\n\t" |
| 165 | "%0 = R0;\n\t" |
| 166 | : "=&d" (dot) |
| 167 | : "a" (x), "a" (y), "a" (len) |
| 168 | : "I0", "I1", "A1", "A0", "R0", "R1" |
| 169 | ); |
| 170 | |
| 171 | return dot; |
| 172 | } |
| 173 | #endif |
| 174 | /*- End of function --------------------------------------------------------*/ |
| 175 | |
| 176 | static __inline__ int16_t fir16(fir16_state_t *fir, int16_t sample) |
| 177 | { |
| 178 | int32_t y; |
| 179 | #if defined(USE_MMX) |
| 180 | int i; |
| 181 | mmx_t *mmx_coeffs; |
| 182 | mmx_t *mmx_hist; |
| 183 | |
| 184 | fir->history[fir->curr_pos] = sample; |
| 185 | fir->history[fir->curr_pos + fir->taps] = sample; |
| 186 | |
| 187 | mmx_coeffs = (mmx_t *) fir->coeffs; |
| 188 | mmx_hist = (mmx_t *) &fir->history[fir->curr_pos]; |
| 189 | i = fir->taps; |
| 190 | pxor_r2r(mm4, mm4); |
| 191 | /* 8 samples per iteration, so the filter must be a multiple of 8 long. */ |
| 192 | while (i > 0) |
| 193 | { |
| 194 | movq_m2r(mmx_coeffs[0], mm0); |
| 195 | movq_m2r(mmx_coeffs[1], mm2); |
| 196 | movq_m2r(mmx_hist[0], mm1); |
| 197 | movq_m2r(mmx_hist[1], mm3); |
| 198 | mmx_coeffs += 2; |
| 199 | mmx_hist += 2; |
| 200 | pmaddwd_r2r(mm1, mm0); |
| 201 | pmaddwd_r2r(mm3, mm2); |
| 202 | paddd_r2r(mm0, mm4); |
| 203 | paddd_r2r(mm2, mm4); |
| 204 | i -= 8; |
| 205 | } |
| 206 | movq_r2r(mm4, mm0); |
| 207 | psrlq_i2r(32, mm0); |
| 208 | paddd_r2r(mm0, mm4); |
| 209 | movd_r2m(mm4, y); |
| 210 | emms(); |
| 211 | #elif defined(USE_SSE2) |
| 212 | int i; |
| 213 | xmm_t *xmm_coeffs; |
| 214 | xmm_t *xmm_hist; |
| 215 | |
| 216 | fir->history[fir->curr_pos] = sample; |
| 217 | fir->history[fir->curr_pos + fir->taps] = sample; |
| 218 | |
| 219 | xmm_coeffs = (xmm_t *) fir->coeffs; |
| 220 | xmm_hist = (xmm_t *) &fir->history[fir->curr_pos]; |
| 221 | i = fir->taps; |
| 222 | pxor_r2r(xmm4, xmm4); |
| 223 | /* 16 samples per iteration, so the filter must be a multiple of 16 long. */ |
| 224 | while (i > 0) |
| 225 | { |
| 226 | movdqu_m2r(xmm_coeffs[0], xmm0); |
| 227 | movdqu_m2r(xmm_coeffs[1], xmm2); |
| 228 | movdqu_m2r(xmm_hist[0], xmm1); |
| 229 | movdqu_m2r(xmm_hist[1], xmm3); |
| 230 | xmm_coeffs += 2; |
| 231 | xmm_hist += 2; |
| 232 | pmaddwd_r2r(xmm1, xmm0); |
| 233 | pmaddwd_r2r(xmm3, xmm2); |
| 234 | paddd_r2r(xmm0, xmm4); |
| 235 | paddd_r2r(xmm2, xmm4); |
| 236 | i -= 16; |
| 237 | } |
| 238 | movdqa_r2r(xmm4, xmm0); |
| 239 | psrldq_i2r(8, xmm0); |
| 240 | paddd_r2r(xmm0, xmm4); |
| 241 | movdqa_r2r(xmm4, xmm0); |
| 242 | psrldq_i2r(4, xmm0); |
| 243 | paddd_r2r(xmm0, xmm4); |
| 244 | movd_r2m(xmm4, y); |
Tzafrir Cohen | f55ccbf | 2008-10-12 08:13:21 +0200 | [diff] [blame^] | 245 | #elif defined(__bfin__) |
David Rowe | 10602db | 2008-10-06 21:41:46 -0700 | [diff] [blame] | 246 | fir->history[fir->curr_pos] = sample; |
| 247 | fir->history[fir->curr_pos + fir->taps] = sample; |
| 248 | y = dot_asm((int16_t*)fir->coeffs, &fir->history[fir->curr_pos], fir->taps); |
| 249 | #else |
| 250 | int i; |
| 251 | int offset1; |
| 252 | int offset2; |
| 253 | |
| 254 | fir->history[fir->curr_pos] = sample; |
| 255 | |
| 256 | offset2 = fir->curr_pos; |
| 257 | offset1 = fir->taps - offset2; |
| 258 | y = 0; |
| 259 | for (i = fir->taps - 1; i >= offset1; i--) |
| 260 | y += fir->coeffs[i]*fir->history[i - offset1]; |
| 261 | for ( ; i >= 0; i--) |
| 262 | y += fir->coeffs[i]*fir->history[i + offset2]; |
| 263 | #endif |
| 264 | if (fir->curr_pos <= 0) |
| 265 | fir->curr_pos = fir->taps; |
| 266 | fir->curr_pos--; |
| 267 | return (int16_t) (y >> 15); |
| 268 | } |
| 269 | /*- End of function --------------------------------------------------------*/ |
| 270 | |
| 271 | static __inline__ const int16_t *fir32_create(fir32_state_t *fir, |
| 272 | const int32_t *coeffs, |
| 273 | int taps) |
| 274 | { |
| 275 | fir->taps = taps; |
| 276 | fir->curr_pos = taps - 1; |
| 277 | fir->coeffs = coeffs; |
| 278 | fir->history = (int16_t *) malloc(taps*sizeof(int16_t)); |
| 279 | if (fir->history) |
| 280 | memset(fir->history, '\0', taps*sizeof(int16_t)); |
| 281 | return fir->history; |
| 282 | } |
| 283 | /*- End of function --------------------------------------------------------*/ |
| 284 | |
| 285 | static __inline__ void fir32_flush(fir32_state_t *fir) |
| 286 | { |
| 287 | memset(fir->history, 0, fir->taps*sizeof(int16_t)); |
| 288 | } |
| 289 | /*- End of function --------------------------------------------------------*/ |
| 290 | |
| 291 | static __inline__ void fir32_free(fir32_state_t *fir) |
| 292 | { |
| 293 | free(fir->history); |
| 294 | } |
| 295 | /*- End of function --------------------------------------------------------*/ |
| 296 | |
| 297 | static __inline__ int16_t fir32(fir32_state_t *fir, int16_t sample) |
| 298 | { |
| 299 | int i; |
| 300 | int32_t y; |
| 301 | int offset1; |
| 302 | int offset2; |
| 303 | |
| 304 | fir->history[fir->curr_pos] = sample; |
| 305 | offset2 = fir->curr_pos; |
| 306 | offset1 = fir->taps - offset2; |
| 307 | y = 0; |
| 308 | for (i = fir->taps - 1; i >= offset1; i--) |
| 309 | y += fir->coeffs[i]*fir->history[i - offset1]; |
| 310 | for ( ; i >= 0; i--) |
| 311 | y += fir->coeffs[i]*fir->history[i + offset2]; |
| 312 | if (fir->curr_pos <= 0) |
| 313 | fir->curr_pos = fir->taps; |
| 314 | fir->curr_pos--; |
| 315 | return (int16_t) (y >> 15); |
| 316 | } |
| 317 | /*- End of function --------------------------------------------------------*/ |
| 318 | |
| 319 | #ifndef __KERNEL__ |
| 320 | static __inline__ const float *fir_float_create(fir_float_state_t *fir, |
| 321 | const float *coeffs, |
| 322 | int taps) |
| 323 | { |
| 324 | fir->taps = taps; |
| 325 | fir->curr_pos = taps - 1; |
| 326 | fir->coeffs = coeffs; |
| 327 | fir->history = (float *) malloc(taps*sizeof(float)); |
| 328 | if (fir->history) |
| 329 | memset(fir->history, '\0', taps*sizeof(float)); |
| 330 | return fir->history; |
| 331 | } |
| 332 | /*- End of function --------------------------------------------------------*/ |
| 333 | |
| 334 | static __inline__ void fir_float_free(fir_float_state_t *fir) |
| 335 | { |
| 336 | free(fir->history); |
| 337 | } |
| 338 | /*- End of function --------------------------------------------------------*/ |
| 339 | |
| 340 | static __inline__ int16_t fir_float(fir_float_state_t *fir, int16_t sample) |
| 341 | { |
| 342 | int i; |
| 343 | float y; |
| 344 | int offset1; |
| 345 | int offset2; |
| 346 | |
| 347 | fir->history[fir->curr_pos] = sample; |
| 348 | |
| 349 | offset2 = fir->curr_pos; |
| 350 | offset1 = fir->taps - offset2; |
| 351 | y = 0; |
| 352 | for (i = fir->taps - 1; i >= offset1; i--) |
| 353 | y += fir->coeffs[i]*fir->history[i - offset1]; |
| 354 | for ( ; i >= 0; i--) |
| 355 | y += fir->coeffs[i]*fir->history[i + offset2]; |
| 356 | if (fir->curr_pos <= 0) |
| 357 | fir->curr_pos = fir->taps; |
| 358 | fir->curr_pos--; |
| 359 | return (int16_t) y; |
| 360 | } |
| 361 | /*- End of function --------------------------------------------------------*/ |
| 362 | #endif |
| 363 | |
| 364 | #ifdef __cplusplus |
| 365 | } |
| 366 | #endif |
| 367 | |
| 368 | #endif |
| 369 | /*- End of file ------------------------------------------------------------*/ |