 @ Source: common_audio/signal_processing/min_max_operations_neon.S - fp2-dev/platform/external/chromium_org/third_party/webrtc - Gitiles

 @
 @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 @
 @ Use of this source code is governed by a BSD-style license
 @ that can be found in the LICENSE file in the root of the source
 @ tree. An additional intellectual property rights grant can be found
 @ in the file PATENTS.  All contributing project authors may
 @ be found in the AUTHORS file in the root of the source tree.
 @

 @ This file contains some minimum and maximum functions, optimized for
 @ ARM Neon platform. The description header can be found in
 @ signal_processing_library.h
 @
 @ The reference C code is in file min_max_operations.c. Code here is basically
 @ a loop unrolling by 8 with Neon instructions. Bit-exact.

 #include "webrtc/system_wrappers/interface/asm_defines.h"

 @ Declare the six entry points implemented in this file. The macro used on
 @ the following lines comes from asm_defines.h, which is not visible here.
 @ NOTE(review): presumably it exports each symbol to the linker - confirm
 @ against that header.
 GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
 GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
 GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
 GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
 GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
 GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon

 @ On ARM targets the GNU assembler treats the operand as a power of two,
 @ so this requests 4-byte alignment for the code that follows.
 .align  2
 @ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
 @
 @ Largest absolute value of the int16 elements in vector, saturated to 32767.
 @ Yields -1 if vector is NULL or length is not positive.
 @
 @   In:      r0 = vector, r1 = length
 @   Out:     r0 = result
 @   Scratch: r1, r2, r3, r12, q12, q13, condition flags
 DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
   mvn r2, #0                  @ r2 = -1: result for invalid arguments.
   cmp r0, #0
   beq END_MAX_ABS_VALUE_W16   @ NULL vector.
   cmp r1, #0
   ble END_MAX_ABS_VALUE_W16   @ length <= 0.

   cmp r1, #8
   blt LOOP_MAX_ABS_VALUE_W16  @ Fewer than 8 elements: scalar loop only.

   sub r1, r1, #8              @ Bias the count so bge ends the Neon loop.
   vmov.i16 q12, #0            @ q12 = eight running unsigned maxima.

 LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
   vld1.16 {q13}, [r0]!        @ Load 8 elements and advance the pointer.
   vabs.s16 q13, q13           @ -32768 stays 0x8000 after vabs.
   vmax.u16 q12, q12, q13      @ Unsigned max, so 0x8000 ranks as 32768.
   subs r1, r1, #8
   bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16

   @ Collapse the eight lanes into lane 0 of d24 and copy it to r2.
   vmax.u16 d24, d24, d25
   vpmax.u16 d24, d24, d24
   vpmax.u16 d24, d24, d24
   vmov.u16 r2, d24[0]         @ Zero-extended, so r2 is in 0..32768.
   adds r1, r1, #8             @ r1 = number of leftover elements (0..7).
   beq END_MAX_ABS_VALUE_W16

 LOOP_MAX_ABS_VALUE_W16:
   ldrsh r3, [r0], #2          @ r3 = next element, sign-extended.
   mov r12, r3, asr #31        @ r12 = 0 for r3 >= 0, -1 otherwise.
   eor r3, r3, r12
   sub r3, r3, r12             @ r3 = absolute value (32768 for -32768).
   cmp r3, r2
   movgt r2, r3                @ Keep the larger of r2 and r3.
   subs r1, r1, #1
   bne LOOP_MAX_ABS_VALUE_W16

 END_MAX_ABS_VALUE_W16:
   mov r0, r2
   cmp r0, #0x8000             @ 32768 does not fit in int16_t,
   subeq r0, r0, #1            @ so report 32767 instead.
   bx  lr


 @ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
 @
 @ Largest absolute value of the int32 elements in vector, clamped to
 @ 0x7FFFFFFF. Yields -1 if vector is NULL or length is not positive.
 @
 @   In:      r0 = vector, r1 = length
 @   Out:     r0 = result
 @   Scratch: r1, r2, r3, r12, q11, q12, q13, q14, condition flags
 DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
   cmp r0, #0
   moveq r0, #-1
   beq EXIT                    @ Return -1 for a NULL pointer.
   cmp r1, #0                  @ length
   movle r0, #-1
   ble EXIT                    @ Return -1 if length <= 0.

   @ r2 is the running unsigned maximum. It has to be cleared here because
   @ vectors shorter than 8 elements skip the Neon code, which is the only
   @ other place that writes r2 before the scalar loop reads it.
   mov r2, #0
   vmov.i32 q11, #0
   vmov.i32 q12, #0
   cmp r1, #8
   blt LOOP_MAX_ABS_VALUE_W32

   sub r1, #8                  @ Bias the count so bge ends the Neon loop.

 LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
   vld1.32 {q13, q14}, [r0]!   @ Load 8 elements and advance the pointer.
   subs r1, #8
   vabs.s32 q13, q13           @ vabs leaves 0x80000000 unchanged.
   vabs.s32 q14, q14
   vmax.u32 q11, q13           @ Unsigned max keeps 0x80000000 as largest.
   vmax.u32 q12, q14
   bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32

   @ Reduce the eight lane maxima to one value and move it to r2.
   vmax.u32 q12, q11
   vmax.u32 d24, d25
   vpmax.u32 d24, d24, d24
   adds r1, #8                 @ r1 = number of leftover elements (0..7).
   vmov.u32 r2, d24[0]
   beq END_MAX_ABS_VALUE_W32

 LOOP_MAX_ABS_VALUE_W32:
   ldr r3, [r0], #4
   eor r12, r3, r3, asr #31    @ eor and then sub, to get absolute value.
   sub r12, r12, r3, asr #31
   cmp r2, r12
   movcc r2, r12               @ Unsigned compare so 0x80000000 can win.
   subs r1, #1
   bne LOOP_MAX_ABS_VALUE_W32

 END_MAX_ABS_VALUE_W32:
   mvn r0, #0x80000000         @ r0 = 0x7FFFFFFF, the largest int32_t.
   cmp r2, r0
   movcc r0, r2                @ r0 = unsigned min(r2, 0x7FFFFFFF).

 EXIT:
   bx  lr

 @ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
 @
 @ Largest signed element of vector. Yields -32768 if vector is NULL or
 @ length is not positive.
 @
 @   In:      r0 = vector, r1 = length
 @   Out:     r0 = result, sign-extended to 32 bits
 @   Scratch: r1, r2, r3, q12, q13, condition flags
 DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
   @ Start from -32768 as a sign-extended 32-bit value. A plain 0x8000 would
   @ read as +32768 in the signed compares of the scalar loop and could never
   @ be replaced by any element.
   mov r2, #0x8000
   sxth r2, r2
   cmp r0, #0
   beq END_MAX_VALUE_W16       @ NULL vector.
   cmp r1, #0
   ble END_MAX_VALUE_W16       @ length <= 0.

   vmov.i16 q12, #0x8000       @ Eight running maxima, each -32768.
   cmp r1, #8
   blt LOOP_MAX_VALUE_W16      @ Fewer than 8 elements: scalar loop only.

   sub r1, #8                  @ Bias the count so bge ends the Neon loop.

 LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
   vld1.16 {q13}, [r0]!        @ Load 8 elements and advance the pointer.
   subs r1, #8
   vmax.s16 q12, q13
   bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16

   @ Find the maximum value in the Neon registers and move it to r2.
   vmax.s16 d24, d25
   vpmax.s16 d24, d24, d24
   vpmax.s16 d24, d24, d24
   adds r1, #8                 @ r1 = number of leftover elements (0..7).
   vmov.s16 r2, d24[0]         @ Sign-extend: r2 feeds signed compares below.
   beq END_MAX_VALUE_W16

 LOOP_MAX_VALUE_W16:
   ldrsh r3, [r0], #2          @ r3 = next element, sign-extended.
   cmp r2, r3
   movlt r2, r3                @ Keep the larger of r2 and r3.
   subs r1, #1
   bne LOOP_MAX_VALUE_W16

 END_MAX_VALUE_W16:
   mov r0, r2
   bx  lr

 @ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
 @
 @ Largest signed element of vector. Yields 0x80000000 (the most negative
 @ int32 value) if vector is NULL or length is not positive.
 @
 @   In:      r0 = vector, r1 = length
 @   Out:     r0 = result
 @   Scratch: r1, r2, r3, q11, q12, q13, q14, condition flags
 DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
   mov r2, #0x80000000         @ r2 = running maximum, seeded with the minimum.
   cmp r0, #0
   beq END_MAX_VALUE_W32       @ NULL vector.
   cmp r1, #0
   ble END_MAX_VALUE_W32       @ length <= 0.

   vmov.i32 q12, #0x80000000   @ Two sets of four running maxima.
   vmov.i32 q11, #0x80000000
   cmp r1, #8
   blt LOOP_MAX_VALUE_W32      @ Fewer than 8 elements: scalar loop only.

   sub r1, r1, #8              @ Bias the count so bge ends the Neon loop.

 LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
   vld1.32 {q13, q14}, [r0]!   @ Load 8 elements and advance the pointer.
   vmax.s32 q11, q11, q13
   vmax.s32 q12, q12, q14
   subs r1, r1, #8
   bge LOOP_UNROLLED_BY_8_MAX_VALUE_W32

   @ Collapse the eight lanes into lane 0 of d24 and copy it to r2.
   vmax.s32 q12, q12, q11
   vpmax.s32 d24, d24, d25
   vpmax.s32 d24, d24, d24
   vmov.32 r2, d24[0]
   adds r1, r1, #8             @ r1 = number of leftover elements (0..7).
   beq END_MAX_VALUE_W32

 LOOP_MAX_VALUE_W32:
   ldr r3, [r0], #4            @ r3 = next element.
   cmp r3, r2
   movgt r2, r3                @ Keep the larger of r2 and r3.
   subs r1, r1, #1
   bne LOOP_MAX_VALUE_W32

 END_MAX_VALUE_W32:
   mov r0, r2
   bx  lr

 @ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
 @
 @ Smallest signed element of vector. Yields 32767 if vector is NULL or
 @ length is not positive.
 @
 @   In:      r0 = vector, r1 = length
 @   Out:     r0 = result
 @   Scratch: r1, r2, r3, q12, q13, condition flags
 DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
   movw r2, #0x7FFF            @ r2 = running minimum, seeded with the maximum.
   cmp r0, #0
   beq END_MIN_VALUE_W16       @ NULL vector.
   cmp r1, #0
   ble END_MIN_VALUE_W16       @ length <= 0.

   vmvn.i16 q12, #0x8000       @ Eight running minima, each 0x7FFF.
   cmp r1, #8
   blt LOOP_MIN_VALUE_W16      @ Fewer than 8 elements: scalar loop only.

   sub r1, r1, #8              @ Bias the count so bge ends the Neon loop.

 LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
   vld1.16 {q13}, [r0]!        @ Load 8 elements and advance the pointer.
   vmin.s16 q12, q12, q13
   subs r1, r1, #8
   bge LOOP_UNROLLED_BY_8_MIN_VALUE_W16

   @ Collapse the eight lanes into the minimum and copy it to r2.
   vmin.s16 d24, d24, d25
   vpmin.s16 d24, d24, d24
   vpmin.s16 d24, d24, d24
   vmov.s16 r2, d24[0]
   sxth r2, r2                 @ Keep r2 sign-extended for the compares below.
   adds r1, r1, #8             @ r1 = number of leftover elements (0..7).
   beq END_MIN_VALUE_W16

 LOOP_MIN_VALUE_W16:
   ldrsh r3, [r0], #2          @ r3 = next element, sign-extended.
   cmp r3, r2
   movle r2, r3                @ Keep the smaller of r2 and r3.
   subs r1, r1, #1
   bne LOOP_MIN_VALUE_W16

 END_MIN_VALUE_W16:
   mov r0, r2
   bx  lr

 @ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
 @
 @ Smallest signed element of vector. Yields 0x7FFFFFFF if vector is NULL or
 @ length is not positive.
 @
 @   In:      r0 = vector, r1 = length
 @   Out:     r0 = result
 @   Scratch: r1, r2, r3, q11, q12, q13, q14, condition flags
 DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
   mvn r2, #0x80000000         @ r2 = 0x7FFFFFFF, seed for the running minimum.
   cmp r0, #0
   beq END_MIN_VALUE_W32       @ NULL vector.
   cmp r1, #0
   ble END_MIN_VALUE_W32       @ length <= 0.

   vdup.32 q12, r2             @ Two sets of four running minima.
   vdup.32 q11, r2
   cmp r1, #8
   blt LOOP_MIN_VALUE_W32      @ Fewer than 8 elements: scalar loop only.

   sub r1, r1, #8              @ Bias the count so bge ends the Neon loop.

 LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
   vld1.32 {q13, q14}, [r0]!   @ Load 8 elements and advance the pointer.
   vmin.s32 q11, q11, q13
   vmin.s32 q12, q12, q14
   subs r1, r1, #8
   bge LOOP_UNROLLED_BY_8_MIN_VALUE_W32

   @ Collapse the eight lanes into the minimum and copy it to r2.
   vmin.s32 q12, q12, q11
   vpmin.s32 d24, d24, d25
   vpmin.s32 d24, d24, d24
   vmov.32 r2, d24[0]
   adds r1, r1, #8             @ r1 = number of leftover elements (0..7).
   beq END_MIN_VALUE_W32

 LOOP_MIN_VALUE_W32:
   ldr r3, [r0], #4            @ r3 = next element.
   cmp r3, r2
   movle r2, r3                @ Keep the smaller of r2 and r3.
   subs r1, r1, #1
   bne LOOP_MIN_VALUE_W32

 END_MIN_VALUE_W32:
   mov r0, r2
   bx  lr
	@
	@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
	@
	@ Use of this source code is governed by a BSD-style license
	@ that can be found in the LICENSE file in the root of the source
	@ tree. An additional intellectual property rights grant can be found
	@ in the file PATENTS. All contributing project authors may
	@ be found in the AUTHORS file in the root of the source tree.
	@

	@ This file contains some minimum and maximum functions, optimized for
	@ ARM Neon platform. The description header can be found in
	@ signal_processing_library.h
	@
	@ The reference C code is in file min_max_operations.c. Code here is basically
	@ a loop unrolling by 8 with Neon instructions. Bit-exact.

	#include "webrtc/system_wrappers/interface/asm_defines.h"

	@ Declare the six entry points implemented in this file. The macro used on
	@ the following lines comes from asm_defines.h, which is not visible here.
	@ NOTE(review): presumably it exports each symbol to the linker - confirm
	@ against that header.
	GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
	GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
	GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
	GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
	GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
	GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon

	@ On ARM targets the GNU assembler treats the operand as a power of two,
	@ so this requests 4-byte alignment for the code that follows.
	.align 2
	@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
	@
	@ Largest absolute value of the int16 elements in vector, saturated to
	@ 32767. Yields -1 if vector is NULL or length is not positive.
	@
	@   In:      r0 = vector, r1 = length
	@   Out:     r0 = result
	@   Scratch: r1, r2, r3, r12, q12, q13, condition flags
	DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
	  mvn r2, #0                  @ r2 = -1: result for invalid arguments.
	  cmp r0, #0
	  beq END_MAX_ABS_VALUE_W16   @ NULL vector.
	  cmp r1, #0
	  ble END_MAX_ABS_VALUE_W16   @ length <= 0.

	  cmp r1, #8
	  blt LOOP_MAX_ABS_VALUE_W16  @ Fewer than 8 elements: scalar loop only.

	  sub r1, r1, #8              @ Bias the count so bge ends the Neon loop.
	  vmov.i16 q12, #0            @ q12 = eight running unsigned maxima.

	LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
	  vld1.16 {q13}, [r0]!        @ Load 8 elements and advance the pointer.
	  vabs.s16 q13, q13           @ -32768 stays 0x8000 after vabs.
	  vmax.u16 q12, q12, q13      @ Unsigned max, so 0x8000 ranks as 32768.
	  subs r1, r1, #8
	  bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16

	  @ Collapse the eight lanes into lane 0 of d24 and copy it to r2.
	  vmax.u16 d24, d24, d25
	  vpmax.u16 d24, d24, d24
	  vpmax.u16 d24, d24, d24
	  vmov.u16 r2, d24[0]         @ Zero-extended, so r2 is in 0..32768.
	  adds r1, r1, #8             @ r1 = number of leftover elements (0..7).
	  beq END_MAX_ABS_VALUE_W16

	LOOP_MAX_ABS_VALUE_W16:
	  ldrsh r3, [r0], #2          @ r3 = next element, sign-extended.
	  mov r12, r3, asr #31        @ r12 = 0 for r3 >= 0, -1 otherwise.
	  eor r3, r3, r12
	  sub r3, r3, r12             @ r3 = absolute value (32768 for -32768).
	  cmp r3, r2
	  movgt r2, r3                @ Keep the larger of r2 and r3.
	  subs r1, r1, #1
	  bne LOOP_MAX_ABS_VALUE_W16

	END_MAX_ABS_VALUE_W16:
	  mov r0, r2
	  cmp r0, #0x8000             @ 32768 does not fit in int16_t,
	  subeq r0, r0, #1            @ so report 32767 instead.
	  bx lr



	@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
	@
	@ Largest absolute value of the int32 elements in vector, clamped to
	@ 0x7FFFFFFF. Yields -1 if vector is NULL or length is not positive.
	@
	@   In:      r0 = vector, r1 = length
	@   Out:     r0 = result
	@   Scratch: r1, r2, r3, r12, q11, q12, q13, q14, condition flags
	DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
	cmp r0, #0
	moveq r0, #-1
	beq EXIT @ Return -1 for a NULL pointer.
	cmp r1, #0 @ length
	movle r0, #-1
	ble EXIT @ Return -1 if length <= 0.

	@ r2 is the running unsigned maximum. It has to be cleared here because
	@ vectors shorter than 8 elements skip the Neon code, which is the only
	@ other place that writes r2 before the scalar loop reads it.
	mov r2, #0
	vmov.i32 q11, #0
	vmov.i32 q12, #0
	cmp r1, #8
	blt LOOP_MAX_ABS_VALUE_W32

	sub r1, #8 @ Bias the count so bge ends the Neon loop.

	LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
	vld1.32 {q13, q14}, [r0]! @ Load 8 elements and advance the pointer.
	subs r1, #8
	vabs.s32 q13, q13 @ vabs leaves 0x80000000 unchanged.
	vabs.s32 q14, q14
	vmax.u32 q11, q13 @ Unsigned max keeps 0x80000000 as largest.
	vmax.u32 q12, q14
	bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32

	@ Reduce the eight lane maxima to one value and move it to r2.
	vmax.u32 q12, q11
	vmax.u32 d24, d25
	vpmax.u32 d24, d24, d24
	adds r1, #8 @ r1 = number of leftover elements (0..7).
	vmov.u32 r2, d24[0]
	beq END_MAX_ABS_VALUE_W32

	LOOP_MAX_ABS_VALUE_W32:
	ldr r3, [r0], #4
	eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value.
	sub r12, r12, r3, asr #31
	cmp r2, r12
	movcc r2, r12 @ Unsigned compare so 0x80000000 can win.
	subs r1, #1
	bne LOOP_MAX_ABS_VALUE_W32

	END_MAX_ABS_VALUE_W32:
	mvn r0, #0x80000000 @ r0 = 0x7FFFFFFF, the largest int32_t.
	cmp r2, r0
	movcc r0, r2 @ r0 = unsigned min(r2, 0x7FFFFFFF).

	EXIT:
	bx lr

	@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
	@
	@ Largest signed element of vector. Yields -32768 if vector is NULL or
	@ length is not positive.
	@
	@   In:      r0 = vector, r1 = length
	@   Out:     r0 = result, sign-extended to 32 bits
	@   Scratch: r1, r2, r3, q12, q13, condition flags
	DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
	@ Start from -32768 as a sign-extended 32-bit value. A plain 0x8000 would
	@ read as +32768 in the signed compares of the scalar loop and could never
	@ be replaced by any element.
	mov r2, #0x8000
	sxth r2, r2
	cmp r0, #0
	beq END_MAX_VALUE_W16 @ NULL vector.
	cmp r1, #0
	ble END_MAX_VALUE_W16 @ length <= 0.

	vmov.i16 q12, #0x8000 @ Eight running maxima, each -32768.
	cmp r1, #8
	blt LOOP_MAX_VALUE_W16 @ Fewer than 8 elements: scalar loop only.

	sub r1, #8 @ Bias the count so bge ends the Neon loop.

	LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
	vld1.16 {q13}, [r0]! @ Load 8 elements and advance the pointer.
	subs r1, #8
	vmax.s16 q12, q13
	bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16

	@ Find the maximum value in the Neon registers and move it to r2.
	vmax.s16 d24, d25
	vpmax.s16 d24, d24, d24
	vpmax.s16 d24, d24, d24
	adds r1, #8 @ r1 = number of leftover elements (0..7).
	vmov.s16 r2, d24[0] @ Sign-extend: r2 feeds signed compares below.
	beq END_MAX_VALUE_W16

	LOOP_MAX_VALUE_W16:
	ldrsh r3, [r0], #2 @ r3 = next element, sign-extended.
	cmp r2, r3
	movlt r2, r3 @ Keep the larger of r2 and r3.
	subs r1, #1
	bne LOOP_MAX_VALUE_W16

	END_MAX_VALUE_W16:
	mov r0, r2
	bx lr

	@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
	@
	@ Largest signed element of vector. Yields 0x80000000 (the most negative
	@ int32 value) if vector is NULL or length is not positive.
	@
	@   In:      r0 = vector, r1 = length
	@   Out:     r0 = result
	@   Scratch: r1, r2, r3, q11, q12, q13, q14, condition flags
	DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
	  mov r2, #0x80000000         @ r2 = running maximum, seeded with the minimum.
	  cmp r0, #0
	  beq END_MAX_VALUE_W32       @ NULL vector.
	  cmp r1, #0
	  ble END_MAX_VALUE_W32       @ length <= 0.

	  vmov.i32 q12, #0x80000000   @ Two sets of four running maxima.
	  vmov.i32 q11, #0x80000000
	  cmp r1, #8
	  blt LOOP_MAX_VALUE_W32      @ Fewer than 8 elements: scalar loop only.

	  sub r1, r1, #8              @ Bias the count so bge ends the Neon loop.

	LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
	  vld1.32 {q13, q14}, [r0]!   @ Load 8 elements and advance the pointer.
	  vmax.s32 q11, q11, q13
	  vmax.s32 q12, q12, q14
	  subs r1, r1, #8
	  bge LOOP_UNROLLED_BY_8_MAX_VALUE_W32

	  @ Collapse the eight lanes into lane 0 of d24 and copy it to r2.
	  vmax.s32 q12, q12, q11
	  vpmax.s32 d24, d24, d25
	  vpmax.s32 d24, d24, d24
	  vmov.32 r2, d24[0]
	  adds r1, r1, #8             @ r1 = number of leftover elements (0..7).
	  beq END_MAX_VALUE_W32

	LOOP_MAX_VALUE_W32:
	  ldr r3, [r0], #4            @ r3 = next element.
	  cmp r3, r2
	  movgt r2, r3                @ Keep the larger of r2 and r3.
	  subs r1, r1, #1
	  bne LOOP_MAX_VALUE_W32

	END_MAX_VALUE_W32:
	  mov r0, r2
	  bx lr

	@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
	@
	@ Smallest signed element of vector. Yields 32767 if vector is NULL or
	@ length is not positive.
	@
	@   In:      r0 = vector, r1 = length
	@   Out:     r0 = result
	@   Scratch: r1, r2, r3, q12, q13, condition flags
	DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
	  movw r2, #0x7FFF            @ r2 = running minimum, seeded with the maximum.
	  cmp r0, #0
	  beq END_MIN_VALUE_W16       @ NULL vector.
	  cmp r1, #0
	  ble END_MIN_VALUE_W16       @ length <= 0.

	  vmvn.i16 q12, #0x8000       @ Eight running minima, each 0x7FFF.
	  cmp r1, #8
	  blt LOOP_MIN_VALUE_W16      @ Fewer than 8 elements: scalar loop only.

	  sub r1, r1, #8              @ Bias the count so bge ends the Neon loop.

	LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
	  vld1.16 {q13}, [r0]!        @ Load 8 elements and advance the pointer.
	  vmin.s16 q12, q12, q13
	  subs r1, r1, #8
	  bge LOOP_UNROLLED_BY_8_MIN_VALUE_W16

	  @ Collapse the eight lanes into the minimum and copy it to r2.
	  vmin.s16 d24, d24, d25
	  vpmin.s16 d24, d24, d24
	  vpmin.s16 d24, d24, d24
	  vmov.s16 r2, d24[0]
	  sxth r2, r2                 @ Keep r2 sign-extended for the compares below.
	  adds r1, r1, #8             @ r1 = number of leftover elements (0..7).
	  beq END_MIN_VALUE_W16

	LOOP_MIN_VALUE_W16:
	  ldrsh r3, [r0], #2          @ r3 = next element, sign-extended.
	  cmp r3, r2
	  movle r2, r3                @ Keep the smaller of r2 and r3.
	  subs r1, r1, #1
	  bne LOOP_MIN_VALUE_W16

	END_MIN_VALUE_W16:
	  mov r0, r2
	  bx lr

	@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
	@
	@ Smallest signed element of vector. Yields 0x7FFFFFFF if vector is NULL or
	@ length is not positive.
	@
	@   In:      r0 = vector, r1 = length
	@   Out:     r0 = result
	@   Scratch: r1, r2, r3, q11, q12, q13, q14, condition flags
	DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
	  mvn r2, #0x80000000         @ r2 = 0x7FFFFFFF, seed for the running minimum.
	  cmp r0, #0
	  beq END_MIN_VALUE_W32       @ NULL vector.
	  cmp r1, #0
	  ble END_MIN_VALUE_W32       @ length <= 0.

	  vdup.32 q12, r2             @ Two sets of four running minima.
	  vdup.32 q11, r2
	  cmp r1, #8
	  blt LOOP_MIN_VALUE_W32      @ Fewer than 8 elements: scalar loop only.

	  sub r1, r1, #8              @ Bias the count so bge ends the Neon loop.

	LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
	  vld1.32 {q13, q14}, [r0]!   @ Load 8 elements and advance the pointer.
	  vmin.s32 q11, q11, q13
	  vmin.s32 q12, q12, q14
	  subs r1, r1, #8
	  bge LOOP_UNROLLED_BY_8_MIN_VALUE_W32

	  @ Collapse the eight lanes into the minimum and copy it to r2.
	  vmin.s32 q12, q12, q11
	  vpmin.s32 d24, d24, d25
	  vpmin.s32 d24, d24, d24
	  vmov.32 r2, d24[0]
	  adds r1, r1, #8             @ r1 = number of leftover elements (0..7).
	  beq END_MIN_VALUE_W32

	LOOP_MIN_VALUE_W32:
	  ldr r3, [r0], #4            @ r3 = next element.
	  cmp r3, r2
	  movle r2, r3                @ Keep the smaller of r2 and r3.
	  subs r1, r1, #1
	  bne LOOP_MIN_VALUE_W32

	END_MIN_VALUE_W32:
	  mov r0, r2
	  bx lr