andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 1 | @ |
| 2 | @ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| 3 | @ |
| 4 | @ Use of this source code is governed by a BSD-style license |
| 5 | @ that can be found in the LICENSE file in the root of the source |
| 6 | @ tree. An additional intellectual property rights grant can be found |
| 7 | @ in the file PATENTS. All contributing project authors may |
| 8 | @ be found in the AUTHORS file in the root of the source tree. |
| 9 | @ |
| 10 | |
| 11 | @ cross_correlation_neon.s |
| 12 | @ This file contains the function WebRtcSpl_CrossCorrelationNeon(), |
| 13 | @ optimized for ARM Neon platform. |
| 14 | @ |
| 15 | @ Reference C code at end of this file. |
| 16 | @ Output is bit-exact with the reference C code, but not with the generic |
| 17 | @ C code in file cross_correlation.c, because using Neon registers reduces |
| 18 | @ the number of shift operations. |
| 19 | |
| 20 | @ Register usage: |
| 21 | @ |
| 22 | @ r0: *cross_correlation (function argument) |
| 23 | @ r1: *seq1 (function argument) |
| 24 | @ r2: *seq2 (function argument) |
| 25 | @ r3: dim_seq (function argument); then, total iterations of LOOP_DIM_SEQ |
| 26 | @ r4: counter for LOOP_DIM_CROSS_CORRELATION |
| 27 | @ r5: seq2_ptr |
| 28 | @ r6: seq1_ptr |
| 29 | @ r7: Total iterations of LOOP_DIM_SEQ_RESIDUAL |
| 30 | @ r8, r9, r10, r11, r12: scratch |
| 31 | |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 32 | #include "webrtc/system_wrappers/interface/asm_defines.h" |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 33 | |
@ WebRtcSpl_CrossCorrelationNeon
@
@ Writes one 32-bit result per outer-loop pass to cross_correlation (r0).
@ Each result is the 64-bit sum of dim_seq products seq1[j] * seq2[j] of
@ signed 16-bit samples, shifted by -right_shifts with a signed 64-bit VSHL
@ (an arithmetic right shift when right_shifts is positive); only the low
@ 32 bits of the shifted sum are stored. seq1 (r1) is re-read from its start
@ for every result, while seq2 (r2) is advanced by step_seq2 samples.
@
@ Register arguments: r0 = cross_correlation, r1 = seq1, r2 = seq2,
@ r3 = dim_seq. dim_cross_correlation, right_shifts and step_seq2 are read
@ from the stack at [sp, #32], [sp, #36] and [sp, #40] after the push below.
@
@ The outer loop tests its counter only at the bottom, so one result is
@ computed and stored before dim_cross_correlation is ever checked.
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 34 | GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 35 | .align 2 |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 36 | DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon |
@ Eight registers (32 bytes) are saved here, which is why the stack
@ arguments are addressed starting at [sp, #32] in the loads that follow.
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 37 | push {r4-r11} |
| 38 | |
| 39 | @ Put the shift value (-right_shifts) into a Neon register. |
@ r10 = -right_shifts, r8 = its sign extension (all zeros or all ones), so
@ d16 holds -right_shifts as a signed 64-bit value (r10 = low word,
@ r8 = high word).
| 40 | ldrsh r10, [sp, #36] |
| 41 | rsb r10, r10, #0 |
| 42 | mov r8, r10, asr #31 |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 43 | vmov d16, r10, r8 |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 44 | |
| 45 | @ Initialize loop counters. |
@ r7 keeps the 0..7 leftover samples handled one at a time; r3 becomes the
@ number of full 8-sample vector iterations. Both stay constant across the
@ outer loop.
| 46 | and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8; |
| 47 | asr r3, r3, #3 @ inner_loop_len1 = dim_seq / 8; |
| 48 | ldrsh r4, [sp, #32] @ dim_cross_correlation |
| 49 | |
@ Outer loop: one pass per output value. q9 and q14 are the two 2 x 64-bit
@ vector accumulators and are cleared for every output.
| 50 | LOOP_DIM_CROSS_CORRELATION: |
| 51 | vmov.i32 q9, #0 |
| 52 | vmov.i32 q14, #0 |
@ movs sets the flags from inner_loop_len1; the two plain mov instructions
@ that follow leave the flags untouched, so the ble still tests that count
@ and skips the vector loop when it is zero or negative.
| 53 | movs r8, r3 @ inner_loop_len1 |
| 54 | mov r6, r1 @ seq1_ptr |
| 55 | mov r5, r2 @ seq2_ptr |
| 56 | ble POST_LOOP_DIM_SEQ |
| 57 | |
@ Vector loop: consumes 8 samples from each sequence per iteration.
@ vmull.s16 produces 4 signed 32-bit products per instruction; vpadal.s32
@ adds adjacent product pairs and accumulates them into the 64-bit lanes.
@ The flags set by subs are consumed by the bgt at the end of the loop; the
@ Neon instructions in between do not modify them.
| 58 | LOOP_DIM_SEQ: |
| 59 | vld1.16 {d20, d21}, [r6]! @ seq1_ptr |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 60 | vld1.16 {d22, d23}, [r5]! @ seq2_ptr |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 61 | subs r8, r8, #1 |
| 62 | vmull.s16 q12, d20, d22 |
| 63 | vmull.s16 q13, d21, d23 |
| 64 | vpadal.s32 q9, q12 |
| 65 | vpadal.s32 q14, q13 |
| 66 | bgt LOOP_DIM_SEQ |
| 67 | |
@ Scalar tail setup: r12:r8 is a 64-bit accumulator (r8 = low word,
@ r12 = high word), cleared here. As above, the flags from movs survive the
@ two mov instructions, so the ble skips the tail when r7 is zero.
| 68 | POST_LOOP_DIM_SEQ: |
| 69 | movs r10, r7 @ Loop counter |
| 70 | mov r12, #0 |
| 71 | mov r8, #0 |
| 72 | ble POST_LOOP_DIM_SEQ_RESIDUAL |
| 73 | |
@ Scalar tail: one sample pair per iteration. ldrh zero-extends, which is
@ harmless because smulbb multiplies only the bottom halfwords as signed
@ 16-bit values. adds/adc then add the sign-extended 32-bit product into
@ the 64-bit accumulator r12:r8.
| 74 | LOOP_DIM_SEQ_RESIDUAL: |
| 75 | ldrh r11, [r6], #2 |
| 76 | ldrh r9, [r5], #2 |
| 77 | smulbb r11, r11, r9 |
| 78 | adds r8, r8, r11 |
| 79 | adc r12, r12, r11, asr #31 |
| 80 | subs r10, #1 |
| 81 | bgt LOOP_DIM_SEQ_RESIDUAL |
| 82 | |
| 83 | POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift. |
@ Fold the four 64-bit vector lanes (d18, d19, d28, d29) into d18, add the
@ scalar tail sum moved into d17, then shift by the signed count in d16.
| 84 | vadd.i64 d18, d19 |
| 85 | vadd.i64 d28, d29 |
| 86 | vadd.i64 d18, d28 |
| 87 | vmov.32 d17[0], r8 |
| 88 | vmov.32 d17[1], r12 |
| 89 | vadd.i64 d17, d18 |
| 90 | vshl.s64 d17, d16 |
@ Only the low 32-bit lane of the shifted 64-bit sum is written; r0 is
@ post-incremented to the next output slot.
| 91 | vst1.32 d17[0], [r0]! @ Store the output |
| 92 | |
@ r2 advances by step_seq2 * 2 bytes, i.e. step_seq2 16-bit samples.
@ NOTE(review): step_seq2 is loaded as a full 32-bit word, unlike the ldrsh
@ loads used for the other stack arguments, while the reference C code at
@ the end of this file declares it int16_t. This presumably relies on the
@ caller having extended the value to 32 bits -- confirm against the C
@ prototype.
| 93 | ldr r8, [sp, #40] @ step_seq2 |
| 94 | add r2, r8, lsl #1 @ prepare for seq2_ptr(r5) in the next loop. |
| 95 | |
| 96 | subs r4, #1 |
| 97 | bgt LOOP_DIM_CROSS_CORRELATION |
| 98 | |
@ Restore the registers saved on entry and return to the caller.
| 99 | pop {r4-r11} |
| 100 | bx lr |
| 101 | |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 102 | @ TODO(kma): Place this piece of reference code into a C code file. |
pbos@webrtc.org | 1727dc7 | 2013-04-09 16:40:28 +0000 | [diff] [blame] | 103 | @ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation, |
| 104 | @ int16_t* seq1, |
| 105 | @ int16_t* seq2, |
| 106 | @ int16_t dim_seq, |
| 107 | @ int16_t dim_cross_correlation, |
| 108 | @ int16_t right_shifts, |
| 109 | @ int16_t step_seq2) { |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 110 | @ int i = 0; |
| 111 | @ int j = 0; |
| 112 | @ int inner_loop_len1 = dim_seq >> 3; |
| 113 | @ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3); |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 114 | @ |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 115 | @ assert(dim_cross_correlation > 0); |
| 116 | @ assert(dim_seq > 0); |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 117 | @ |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 118 | @ for (i = 0; i < dim_cross_correlation; i++) { |
| 119 | @ int16_t *seq1_ptr = seq1; |
| 120 | @ int16_t *seq2_ptr = seq2 + (step_seq2 * i); |
| 121 | @ int64_t sum = 0; |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 122 | @ |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 123 | @ for (j = inner_loop_len1; j > 0; j -= 1) { |
| 124 | @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| 125 | @ seq1_ptr++; |
| 126 | @ seq2_ptr++; |
| 127 | @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| 128 | @ seq1_ptr++; |
| 129 | @ seq2_ptr++; |
| 130 | @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| 131 | @ seq1_ptr++; |
| 132 | @ seq2_ptr++; |
| 133 | @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| 134 | @ seq1_ptr++; |
| 135 | @ seq2_ptr++; |
| 136 | @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| 137 | @ seq1_ptr++; |
| 138 | @ seq2_ptr++; |
| 139 | @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| 140 | @ seq1_ptr++; |
| 141 | @ seq2_ptr++; |
| 142 | @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| 143 | @ seq1_ptr++; |
| 144 | @ seq2_ptr++; |
| 145 | @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| 146 | @ seq1_ptr++; |
| 147 | @ seq2_ptr++; |
| 148 | @ } |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 149 | @ |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 150 | @ // Calculate the rest of the samples. |
| 151 | @ for (j = inner_loop_len2; j > 0; j -= 1) { |
| 152 | @ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); |
| 153 | @ seq1_ptr++; |
| 154 | @ seq2_ptr++; |
| 155 | @ } |
kma@webrtc.org | 9fc6250 | 2012-11-17 00:22:46 +0000 | [diff] [blame] | 156 | @ |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 157 | @ *cross_correlation++ = (int32_t)(sum >> right_shifts); |
| 158 | @ } |
| 159 | @ } |