blob: 15b25b8f5bdbfe6c05766c764d0628f2f7cec0d2 [file] [log] [blame]
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +00001@
2@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
3@
4@ Use of this source code is governed by a BSD-style license
5@ that can be found in the LICENSE file in the root of the source
6@ tree. An additional intellectual property rights grant can be found
7@ in the file PATENTS. All contributing project authors may
8@ be found in the AUTHORS file in the root of the source tree.
9@
10
11@ cross_correlation_neon.s
12@ This file contains the function WebRtcSpl_CrossCorrelationNeon(),
13@ optimized for ARM Neon platform.
14@
15@ Reference C code at end of this file.
16@ Output is bit-exact with the reference C code, but not with the generic
17@ C code in file cross_correlation.c, due to reduction of shift operations
18@ from using Neon registers.
19
20@ Register usage:
21@
22@ r0: *cross_correlation (function argument)
23@ r1: *seq1 (function argument)
24@ r2: *seq2 (function argument)
25@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
26@ r4: counter for LOOP_DIM_CROSS_CORRELATION
27@ r5: seq2_ptr
28@ r6: seq1_ptr
29@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
30@ r8, r9, r10, r11, r12: scratch
31
kma@webrtc.org9fc62502012-11-17 00:22:46 +000032#include "webrtc/system_wrappers/interface/asm_defines.h"
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +000033
@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
@                                     int16_t* seq1,
@                                     int16_t* seq2,
@                                     int16_t dim_seq,
@                                     int16_t dim_cross_correlation,
@                                     int16_t right_shifts,
@                                     int16_t step_seq2)
@
@ For each of dim_cross_correlation outputs, accumulates the 64-bit sum of
@ seq1[j] * seq2[j] over dim_seq samples, shifts the sum by -right_shifts
@ with vshl.s64 and stores the low 32 bits through cross_correlation.
@ The seq2 base pointer advances by step_seq2 samples between outputs.
@
@ Arguments:
@   r0: cross_correlation (post-incremented by 4 for every output stored)
@   r1: seq1
@   r2: seq2
@   r3: dim_seq
@   Stack, offsets valid after the 32-byte push below:
@     [sp, 32]: dim_cross_correlation
@     [sp, 36]: right_shifts
@     [sp, 40]: step_seq2
@ Modifies: r0, r2, r3, r12, d16-d29 and the flags.
@ r4-r11 are saved on entry and restored on exit.
GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
.align  2
DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
  push {r4-r11}

  @ Put the shift value (-right_shifts) into a Neon register, sign-extended
  @ to 64 bits (r8 holds the upper word) for the vshl.s64 below.
  ldrsh r10, [sp, #36]
  rsb r10, r10, #0
  mov r8, r10, asr #31
  vmov d16, r10, r8

  @ Initialize loop counters.
  and r7, r3, #7                    @ inner_loop_len2 = dim_seq % 8;
  asr r3, r3, #3                    @ inner_loop_len1 = dim_seq / 8;
  ldrsh r4, [sp, #32]               @ dim_cross_correlation

  @ The outer loop is bottom-tested, so without this check a count of zero
  @ (or a negative count) would still compute and store one output. Skip
  @ straight to the epilogue instead, like the for loop in the reference
  @ C code at the end of this file.
  cmp r4, #0
  ble END_CROSS_CORRELATION

LOOP_DIM_CROSS_CORRELATION:
  vmov.i32 q9, #0                   @ 64-bit partial sums, samples 0-3
  vmov.i32 q14, #0                  @ 64-bit partial sums, samples 4-7
  movs r8, r3                       @ inner_loop_len1 (sets flags for ble)
  mov r6, r1                        @ seq1_ptr
  mov r5, r2                        @ seq2_ptr
  ble POST_LOOP_DIM_SEQ             @ Fewer than 8 samples: residual only.

LOOP_DIM_SEQ:
  @ Eight samples per iteration: widen-multiply to 32 bits, then pairwise
  @ add and accumulate into the 64-bit lanes of q9 and q14.
  vld1.16 {d20, d21}, [r6]!         @ seq1_ptr
  vld1.16 {d22, d23}, [r5]!         @ seq2_ptr
  subs r8, r8, #1                   @ Flags survive the Neon ops until bgt.
  vmull.s16 q12, d20, d22
  vmull.s16 q13, d21, d23
  vpadal.s32 q9, q12
  vpadal.s32 q14, q13
  bgt LOOP_DIM_SEQ

POST_LOOP_DIM_SEQ:
  movs r10, r7                      @ Loop counter
  mov r12, #0                       @ Residual sum, high word
  mov r8, #0                        @ Residual sum, low word
  ble POST_LOOP_DIM_SEQ_RESIDUAL

LOOP_DIM_SEQ_RESIDUAL:
  @ smulbb multiplies the signed bottom halfwords, so the zero-extending
  @ ldrh loads are sufficient here.
  ldrh r11, [r6], #2
  ldrh r9, [r5], #2
  smulbb r11, r11, r9
  adds r8, r8, r11                  @ r12:r8 += sign-extended product
  adc r12, r12, r11, asr #31
  subs r10, #1
  bgt LOOP_DIM_SEQ_RESIDUAL

POST_LOOP_DIM_SEQ_RESIDUAL:         @ Sum the results up and do the shift.
  vadd.i64 d18, d19
  vadd.i64 d28, d29
  vadd.i64 d18, d28
  vmov.32 d17[0], r8
  vmov.32 d17[1], r12
  vadd.i64 d17, d18
  vshl.s64 d17, d16                 @ Negative count in d16 shifts right.
  vst1.32 d17[0], [r0]!             @ Store the output

  @ step_seq2 is an int16_t: load it as a signed halfword, the same way as
  @ the other two stack arguments, instead of trusting the full word.
  ldrsh r8, [sp, #40]               @ step_seq2
  add r2, r2, r8, lsl #1            @ prepare for seq2_ptr(r5) in the next loop.

  subs r4, #1
  bgt LOOP_DIM_CROSS_CORRELATION

END_CROSS_CORRELATION:
  pop {r4-r11}
  bx lr
101
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000102@ TODO(kma): Place this piece of reference code into a C code file.
pbos@webrtc.org1727dc72013-04-09 16:40:28 +0000103@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
104@ int16_t* seq1,
105@ int16_t* seq2,
106@ int16_t dim_seq,
107@ int16_t dim_cross_correlation,
108@ int16_t right_shifts,
109@ int16_t step_seq2) {
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000110@ int i = 0;
111@ int j = 0;
112@ int inner_loop_len1 = dim_seq >> 3;
113@ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
kma@webrtc.org9fc62502012-11-17 00:22:46 +0000114@
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000115@ assert(dim_cross_correlation > 0);
116@ assert(dim_seq > 0);
kma@webrtc.org9fc62502012-11-17 00:22:46 +0000117@
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000118@ for (i = 0; i < dim_cross_correlation; i++) {
119@ int16_t *seq1_ptr = seq1;
120@ int16_t *seq2_ptr = seq2 + (step_seq2 * i);
121@ int64_t sum = 0;
kma@webrtc.org9fc62502012-11-17 00:22:46 +0000122@
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000123@ for (j = inner_loop_len1; j > 0; j -= 1) {
124@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
125@ seq1_ptr++;
126@ seq2_ptr++;
127@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
128@ seq1_ptr++;
129@ seq2_ptr++;
130@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
131@ seq1_ptr++;
132@ seq2_ptr++;
133@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
134@ seq1_ptr++;
135@ seq2_ptr++;
136@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
137@ seq1_ptr++;
138@ seq2_ptr++;
139@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
140@ seq1_ptr++;
141@ seq2_ptr++;
142@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
143@ seq1_ptr++;
144@ seq2_ptr++;
145@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
146@ seq1_ptr++;
147@ seq2_ptr++;
148@ }
kma@webrtc.org9fc62502012-11-17 00:22:46 +0000149@
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000150@ // Calculate the rest of the samples.
151@ for (j = inner_loop_len2; j > 0; j -= 1) {
152@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
153@ seq1_ptr++;
154@ seq2_ptr++;
155@ }
kma@webrtc.org9fc62502012-11-17 00:22:46 +0000156@
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000157@ *cross_correlation++ = (int32_t)(sum >> right_shifts);
158@ }
159@ }