Blame - common_audio/signal_processing/cross_correlation_neon.S - fp2-dev/platform/external/chromium_org/third_party/webrtc

blob: 15b25b8f5bdbfe6c05766c764d0628f2f7cec0d2 [file] [log] [blame]

andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	1	@
				2	@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
				3	@
				4	@ Use of this source code is governed by a BSD-style license
				5	@ that can be found in the LICENSE file in the root of the source
				6	@ tree. An additional intellectual property rights grant can be found
				7	@ in the file PATENTS. All contributing project authors may
				8	@ be found in the AUTHORS file in the root of the source tree.
				9	@
				10
				11	@ cross_correlation_neon.s
				12	@ This file contains the function WebRtcSpl_CrossCorrelationNeon(),
				13	@ optimized for ARM Neon platform.
				14	@
				15	@ Reference C code at end of this file.
				16	@ Output is bit-exact with the reference C code, but not with the generic
				17	@ C code in file cross_correlation.c, due to reduction of shift operations
				18	@ from using Neon registers.
				19
				20	@ Register usage:
				21	@
				22	@ r0: *cross_correlation (function argument)
				23	@ r1: *seq1 (function argument)
				24	@ r2: *seq2 (function argument)
				25	@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
				26	@ r4: counter for LOOP_DIM_CROSS_CORRELATION
				27	@ r5: seq2_ptr
				28	@ r6: seq1_ptr
				29	@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
				30	@ r8, r9, r10, r11, r12: scratch
				31
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	32	#include "webrtc/system_wrappers/interface/asm_defines.h"
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	33
@ WebRtcSpl_CrossCorrelationNeon
@ For each output element: accumulate, in 64 bits, the sum of products of
@ dim_seq 16-bit samples read from seq1 (r1) and from the current seq2
@ position (r2); shift the sum with vshl.s64 by -right_shifts; store the low
@ 32 bits through r0. The seq2 base then advances by step_seq2 samples.
@ Stack arguments as read below (after the 32-byte push of r4-r11):
@ [sp, #32] dim_cross_correlation, [sp, #36] right_shifts, [sp, #40] step_seq2.
@ The outer loop tests its counter after the body, so one output is written
@ even when dim_cross_correlation <= 0 (the reference C code asserts > 0).
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	34	GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	35	.align 2
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	36	DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	37	push {r4-r11} @ 8 registers = 32 bytes, so stack arguments start at [sp, #32]
				38
				39	@ Put the shift value (-right_shifts) into a Neon register.
				40	ldrsh r10, [sp, #36] @ r10 = right_shifts, sign-extended from 16 bits
				41	rsb r10, r10, #0 @ r10 = -right_shifts
				42	mov r8, r10, asr #31 @ r8 = sign word of r10 (all ones if negative, else zero)
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	43	vmov d16, r10, r8 @ d16 = r8:r10, the shift count as a signed 64-bit value
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	44
				45	@ Initialize loop counters.
				46	and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8;
				47	asr r3, r3, #3 @ inner_loop_len1 = dim_seq / 8;
				48	ldrsh r4, [sp, #32] @ dim_cross_correlation
				49
				50	LOOP_DIM_CROSS_CORRELATION:
				51	vmov.i32 q9, #0 @ clear accumulator for products of the low 4 lanes
				52	vmov.i32 q14, #0 @ clear accumulator for products of the high 4 lanes
				53	movs r8, r3 @ inner_loop_len1; sets the flags tested by the ble below
				54	mov r6, r1 @ seq1_ptr
				55	mov r5, r2 @ seq2_ptr
				56	ble POST_LOOP_DIM_SEQ @ skip the vector loop when there is no full group of 8
				57
				58	LOOP_DIM_SEQ:
				59	vld1.16 {d20, d21}, [r6]! @ seq1_ptr: load 8 samples, post-increment
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	60	vld1.16 {d22, d23}, [r5]! @ seq2_ptr: load 8 samples, post-increment
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	61	subs r8, r8, #1 @ count down; flags are consumed by the bgt after the Neon work
				62	vmull.s16 q12, d20, d22 @ 4 signed 16x16 -> 32-bit products (samples 0-3)
				63	vmull.s16 q13, d21, d23 @ 4 signed 16x16 -> 32-bit products (samples 4-7)
				64	vpadal.s32 q9, q12 @ add adjacent product pairs into the two 64-bit lanes of q9
				65	vpadal.s32 q14, q13 @ add adjacent product pairs into the two 64-bit lanes of q14
				66	bgt LOOP_DIM_SEQ
				67
				68	POST_LOOP_DIM_SEQ:
				69	movs r10, r7 @ Loop counter
				70	mov r12, #0 @ high word of the 64-bit scalar sum (r12:r8)
				71	mov r8, #0 @ low word of the 64-bit scalar sum
				72	ble POST_LOOP_DIM_SEQ_RESIDUAL @ no leftover samples (dim_seq % 8 == 0)
				73
				74	LOOP_DIM_SEQ_RESIDUAL:
				75	ldrh r11, [r6], #2 @ next seq1 sample, post-increment
				76	ldrh r9, [r5], #2 @ next seq2 sample, post-increment
				77	smulbb r11, r11, r9 @ signed product of the two bottom halfwords
				78	adds r8, r8, r11 @ add product to the low word, producing carry
				79	adc r12, r12, r11, asr #31 @ add carry and the sign word of the product to the high word
				80	subs r10, #1
				81	bgt LOOP_DIM_SEQ_RESIDUAL
				82
				83	POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift.
				84	vadd.i64 d18, d19 @ fold the two lanes of q9
				85	vadd.i64 d28, d29 @ fold the two lanes of q14
				86	vadd.i64 d18, d28 @ d18 = sum over all vector-loop products
				87	vmov.32 d17[0], r8 @ d17 = scalar residual sum, low word
				88	vmov.32 d17[1], r12 @ d17 = scalar residual sum, high word
				89	vadd.i64 d17, d18 @ d17 = total 64-bit sum
				90	vshl.s64 d17, d16 @ shift by -right_shifts (a negative count shifts right)
				91	vst1.32 d17[0], [r0]! @ Store the output
				92
				93	ldr r8, [sp, #40] @ step_seq2; NOTE(review): full-word load, unlike the ldrsh used for the other stack arguments - confirm the caller widens this value
				94	add r2, r8, lsl #1 @ prepare for seq2_ptr(r5) in the next loop.
				95
				96	subs r4, #1
				97	bgt LOOP_DIM_CROSS_CORRELATION
				98
				99	pop {r4-r11}
				100	bx lr
				101
				101
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	102	@ TODO(kma): Place this piece of reference code into a C code file.
pbos@webrtc.org	1727dc7	2013-04-09 16:40:28 +0000	[diff] [blame]	103	@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
				104	@ int16_t* seq1,
				105	@ int16_t* seq2,
				106	@ int16_t dim_seq,
				107	@ int16_t dim_cross_correlation,
				108	@ int16_t right_shifts,
				109	@ int16_t step_seq2) {
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	110	@ int i = 0;
				111	@ int j = 0;
				112	@ int inner_loop_len1 = dim_seq >> 3;
				113	@ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	114	@
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	115	@ assert(dim_cross_correlation > 0);
				116	@ assert(dim_seq > 0);
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	117	@
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	118	@ for (i = 0; i < dim_cross_correlation; i++) {
				119	@ int16_t *seq1_ptr = seq1;
				120	@ int16_t *seq2_ptr = seq2 + (step_seq2 * i);
				121	@ int64_t sum = 0;
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	122	@
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	123	@ for (j = inner_loop_len1; j > 0; j -= 1) {
				124	@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
				125	@ seq1_ptr++;
				126	@ seq2_ptr++;
				127	@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
				128	@ seq1_ptr++;
				129	@ seq2_ptr++;
				130	@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
				131	@ seq1_ptr++;
				132	@ seq2_ptr++;
				133	@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
				134	@ seq1_ptr++;
				135	@ seq2_ptr++;
				136	@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
				137	@ seq1_ptr++;
				138	@ seq2_ptr++;
				139	@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
				140	@ seq1_ptr++;
				141	@ seq2_ptr++;
				142	@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
				143	@ seq1_ptr++;
				144	@ seq2_ptr++;
				145	@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
				146	@ seq1_ptr++;
				147	@ seq2_ptr++;
				148	@ }
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	149	@
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	150	@ // Calculate the rest of the samples.
				151	@ for (j = inner_loop_len2; j > 0; j -= 1) {
				152	@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
				153	@ seq1_ptr++;
				154	@ seq2_ptr++;
				155	@ }
kma@webrtc.org	9fc6250	2012-11-17 00:22:46 +0000	[diff] [blame]	156	@
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	157	@ *cross_correlation++ = (int32_t)(sum >> right_shifts);
				158	@ }
				159	@ }