@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ vector_scaling_operations_neon.s
@ This file contains the function WebRtcSpl_ScaleAndAddVectorsWithRoundNeon(),
@ optimized for ARM Neon platform. Output is bit-exact with the reference
@ C code in vector_scaling_operations.c.

kma@webrtc.org9fc62502012-11-17 00:22:46 +000016#include "webrtc/system_wrappers/interface/asm_defines.h"
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +000017
@ WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
@
@ For every i in [0, length) this routine stores, as a 16-bit value,
@
@   (in_vector1[i] * in_vector1_scale + in_vector2[i] * in_vector2_scale
@    + ((1 << right_shifts) >> 1)) >> right_shifts
@
@ to out_vector[i]. Blocks of 8 elements are processed with NEON; any
@ remaining (length % 8) elements, or the whole input when length < 8, are
@ processed one at a time with scalar instructions.
@
@ Inputs, as read by the code below:
@   r0         in_vector1        16-bit elements, pointer is post-incremented
@   r1         in_vector1_scale  low halfword is used as a signed multiplier
@   r2         in_vector2        16-bit elements, pointer is post-incremented
@   r3         in_vector2_scale  low halfword is used as a signed multiplier
@   [sp, #0]   right_shifts      loaded as a signed halfword
@   [sp, #4]   out_vector        16-bit elements, pointer is post-incremented
@   [sp, #8]   length            element count; nothing is done if <= 0
@ The stack offsets above are relative to sp on entry; inside the function
@ they are 24 bytes larger because six registers (r4-r9) are pushed first.
@
@ Register roles inside the function:
@   r4 = length, later the scalar-loop counter   r5 = out_vector
@   r6 = right_shifts                            r7, r8 = temporaries
@   r9 = scalar rounding constant
@   d26 / d27 = the two scales, q12 = -right_shifts, q14 / q15 = loaded input,
@   q0-q3 = 32-bit products and sums
@
@ r4-r9 are saved and restored. The NEON registers q0-q3 and q12-q15 and the
@ condition flags are overwritten and not restored.
GLOBAL_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
.align 2
DEFINE_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
  push {r4-r9}

  ldr r4, [sp, #32]             @ length
  ldr r5, [sp, #28]             @ out_vector
  ldrsh r6, [sp, #24]           @ right_shifts

  cmp r4, #0
  ble END                       @ Return if length <= 0.

  cmp r4, #8
  blt SET_ROUND_VALUE           @ Fewer than 8 elements: scalar path only.

  vdup.16 d26, r1               @ in_vector1_scale
  vdup.16 d27, r3               @ in_vector2_scale

  @ Neon instructions can only right shift by an immediate value. To shift
  @ right by a register value, we have to do a left shift by the negative
  @ value.
  @ NOTE(review): the count is duplicated into 16-bit lanes but consumed below
  @ by a 32-bit-lane vrshl. This relies on VRSHL (register) reading the shift
  @ amount from the least significant byte of each lane - confirm against the
  @ ARM Architecture Reference Manual.
  rsb r7, r6, #0
  vdup.16 q12, r7               @ -right_shifts

  bic r7, r4, #7                @ Counter for LOOP_UNROLLED_BY_8: length / 8 * 8.

LOOP_UNROLLED_BY_8:
  vld1.16 {d28, d29}, [r0]!     @ in_vector1[], 8 elements
  vld1.16 {d30, d31}, [r2]!     @ in_vector2[], 8 elements
  vmull.s16 q0, d28, d26        @ in_vector1[0..3] * in_vector1_scale
  vmull.s16 q1, d29, d26        @ in_vector1[4..7] * in_vector1_scale
  vmull.s16 q2, d30, d27        @ in_vector2[0..3] * in_vector2_scale
  vmull.s16 q3, d31, d27        @ in_vector2[4..7] * in_vector2_scale
  vadd.s32 q0, q2
  vadd.s32 q1, q3
  vrshl.s32 q0, q12             @ Round shift right by right_shifts.
  vrshl.s32 q1, q12
  vmovn.i32 d0, q0              @ Cast to 16 bit values.
  vmovn.i32 d1, q1
  subs r7, #8                   @ Sets the flags tested by bgt below.
  vst1.16 {d0, d1}, [r5]!
  bgt LOOP_UNROLLED_BY_8

  @ Only the low three bits of length are the remainder modulo 8. A wider
  @ mask (this used to be #0xFF) makes the scalar loop run again over up to
  @ 255 elements starting at the already advanced pointers, which reads and
  @ writes beyond the end of the vectors whenever length >= 8 and
  @ (length & 0xFF) >= 8.
  ands r4, #7                   @ Counter for LOOP_NO_UNROLLING: length % 8.
  beq END

SET_ROUND_VALUE:
  @ r9 = (1 << right_shifts) >> 1, the rounding term added before the shift.
  mov r9, #1
  lsl r9, r6
  lsr r9, #1

LOOP_NO_UNROLLING:
  ldrh r7, [r0], #2             @ in_vector1[i]
  ldrh r8, [r2], #2             @ in_vector2[i]
  smulbb r7, r7, r1             @ Signed 16x16 multiply of the low halfwords,
  smulbb r8, r8, r3             @ so the zero-extending ldrh above is harmless.
  subs r4, #1                   @ Flags are consumed by bne below; none of the
                                @ instructions in between has an S suffix.
  add r7, r9
  add r7, r8
  asr r7, r6
  strh r7, [r5], #2             @ out_vector[i], truncated to 16 bits
  bne LOOP_NO_UNROLLING

END:
  pop {r4-r9}
  bx lr