andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 1 | @ |
| 2 | @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
| 3 | @ |
| 4 | @ Use of this source code is governed by a BSD-style license |
| 5 | @ that can be found in the LICENSE file in the root of the source |
| 6 | @ tree. An additional intellectual property rights grant can be found |
| 7 | @ in the file PATENTS. All contributing project authors may |
| 8 | @ be found in the AUTHORS file in the root of the source tree. |
| 9 | @ |
| 10 | |
| 11 | @ vector_scaling_operations_neon.s |
| 12 | @ This file contains the function WebRtcSpl_ScaleAndAddVectorsWithRoundNeon(), |
| 13 | @ optimized for ARM Neon platform. Output is bit-exact with the reference |
| 14 | @ C code in vector_scaling_operations.c. |
| 15 | |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 16 | #include "webrtc/system_wrappers/interface/asm_defines.h" |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 17 | |
@ WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
@
@ For every i in [0, length) this routine stores, as a 16-bit value,
@   out_vector[i] = (in_vector1[i] * in_vector1_scale
@                    + in_vector2[i] * in_vector2_scale
@                    + round_value) >> right_shifts
@ where round_value = (1 << right_shifts) >> 1.
@
@ Arguments on entry:
@   r0        in_vector1        (pointer to 16-bit elements, post-incremented)
@   r1        in_vector1_scale  (only the low 16 bits are used)
@   r2        in_vector2        (pointer to 16-bit elements, post-incremented)
@   r3        in_vector2_scale  (only the low 16 bits are used)
@   [sp, #0]  right_shifts      (loaded as a signed 16-bit value)
@   [sp, #4]  out_vector
@   [sp, #8]  length
@
@ Register roles inside the function:
@   r4  elements left for the scalar loop     r5  out_vector write pointer
@   r6  right_shifts                          r7  NEON loop counter / scratch
@   r8  scratch (second product)              r9  round_value (scalar loop)
@   d26 in_vector1_scale in every lane        d27 in_vector2_scale in every lane
@   q12 -right_shifts in every 16-bit lane
@
@ Saves and restores r4-r9. Modifies r0, r2, q0-q3, q12-q15 and the flags.
@ Returns immediately, writing nothing, when length <= 0.
GLOBAL_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
.align  2
DEFINE_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
  push {r4-r9}              @ 6 registers = 24 bytes; stack args move up by 24.

  ldr r4, [sp, #32]         @ length
  ldr r5, [sp, #28]         @ out_vector
  ldrsh r6, [sp, #24]       @ right_shifts

  cmp r4, #0
  ble END                   @ Return if length <= 0.

  cmp r4, #8
  blt SET_ROUND_VALUE       @ Fewer than 8 elements: scalar loop only.

  vdup.16 d26, r1           @ in_vector1_scale
  vdup.16 d27, r3           @ in_vector2_scale

  @ Neon instructions can only right shift by an immediate value. To shift
  @ right by a register value, we have to do a left shift by the negative
  @ value.
  rsb r7, r6, #0
  vdup.16 q12, r7           @ -right_shifts

  bic r7, r4, #7            @ Counter for LOOP_UNROLLED_BY_8: length / 8 * 8.

LOOP_UNROLLED_BY_8:
  vld1.16 {d28, d29}, [r0]! @ in_vector1[], 8 elements
  vld1.16 {d30, d31}, [r2]! @ in_vector2[], 8 elements
  vmull.s16 q0, d28, d26    @ 32-bit products in_vector1[] * in_vector1_scale
  vmull.s16 q1, d29, d26
  vmull.s16 q2, d30, d27    @ 32-bit products in_vector2[] * in_vector2_scale
  vmull.s16 q3, d31, d27
  vadd.s32 q0, q2
  vadd.s32 q1, q3
  vrshl.s32 q0, q12         @ Round shift right by right_shifts.
  vrshl.s32 q1, q12
  vmovn.i32 d0, q0          @ Cast to 16 bit values.
  vmovn.i32 d1, q1
  subs r7, #8               @ Flags from here feed the bgt below.
  vst1.16 {d0, d1}, [r5]!   @ Store 8 results; does not touch the flags.
  bgt LOOP_UNROLLED_BY_8

  @ Only the elements the NEON loop did not cover may be processed below, so
  @ the mask has to be 7 (length % 8). A wider mask would leave part of the
  @ already-processed count in r4 and run the scalar loop past the buffers.
  ands r4, #7               @ Counter for LOOP_NO_UNROLLING: length % 8.
  beq END

SET_ROUND_VALUE:
  mov r9, #1
  lsl r9, r6
  lsr r9, #1                @ round_value = (1 << right_shifts) >> 1

LOOP_NO_UNROLLING:
  ldrh r7, [r0], #2         @ in_vector1[i]; smulbb treats low half as signed.
  ldrh r8, [r2], #2         @ in_vector2[i]
  smulbb r7, r7, r1         @ in_vector1[i] * in_vector1_scale
  smulbb r8, r8, r3         @ in_vector2[i] * in_vector2_scale
  subs r4, #1               @ Flags from here feed the bne below.
  add r7, r9
  add r7, r8
  asr r7, r6
  strh r7, [r5], #2         @ Store the low 16 bits to out_vector[i].
  bne LOOP_NO_UNROLLING

END:
  pop {r4-r9}
  bx  lr