/*
* Copyright (C) 2008 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ---- includes ----------------------------------------------------------- */
#include "b_BasicEm/Basic.h" /* to disable some warnings in VC++ */
#if ( defined( WIN64 ) || defined( HW_SSE2 ) )
#include "emmintrin.h"
/* disable warning "local variable 'x' used without having been initialized" */
#pragma warning( disable : 4700 )
/** Using half register (64-bit) in SSE2 to calculate dot product.
 * This is an SSE2 reimplementation of bbs_dotProduct_intelMMX16 in Math.c.
 * Dependencies: input vectors need to be 16-bit aligned
 * Return Value: int32 containing result of dot product
 */
int32 bbs_dotProduct_64SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM1, m_XMM2, m_XMM3, m_XMM4, m_XMM5, m_XMM6, m_XMM7, m_XMM8;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;
	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize registers to 0 (xor of a register with itself) */
	m_XMM4 = _mm_xor_si128( m_XMM4, m_XMM4 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );
	m_XMM7 = _mm_xor_si128( m_XMM7, m_XMM7 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		/* process 16 int16 elements per iteration (4 x 64-bit loads per vector);
		   the madd results left in m_XMM4 and m_XMM6 are folded into the
		   accumulator m_XMM7 at the start of the following iteration */
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_loadl_epi64( (__m128i *)&0[vec1L] );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );
			m_XMM1 = _mm_loadl_epi64( (__m128i *)&0[vec2L] );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );
			m_XMM2 = _mm_loadl_epi64( (__m128i *)&4[vec1L] );
			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM1 );
			m_XMM3 = _mm_loadl_epi64( (__m128i *)&4[vec2L] );
			m_XMM4 = _mm_loadl_epi64( (__m128i *)&8[vec1L] );
			m_XMM2 = _mm_madd_epi16( m_XMM2, m_XMM3 );
			m_XMM5 = _mm_loadl_epi64( (__m128i *)&8[vec2L] );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );
			m_XMM6 = _mm_loadl_epi64( (__m128i *)&12[vec1L] );
			m_XMM4 = _mm_madd_epi16( m_XMM4, m_XMM5 );
			m_XMM8 = _mm_loadl_epi64( (__m128i *)&12[vec2L] );
			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM8 );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM2 );
			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );
		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );
		m_XMM0 = _mm_loadl_epi64( (__m128i *)&m_XMM7 ); /* copy low 64 bit of accumulator */
		m_XMM0 = _mm_srli_epi64( m_XMM0, 32 );          /* move upper partial sum down */
		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );       /* add the two 32-bit partial sums */
		resultL = _mm_cvtsi128_si32( m_XMM7 );
	}

	/* handle the remaining 0..15 elements; a switch statement (with intentional
	   fall-through) produces faster code than a loop */
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}
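
/* For reference, each of the SSE2 routines in this file computes the plain
 * 16-bit integer dot product sketched below. This is an illustrative scalar
 * equivalent only, not the actual reference implementation
 * (bbs_dotProduct_intelMMX16 in Math.c):
 *
 *   int32 dotL = 0;
 *   uint32 iL;
 *   for( iL = 0; iL < sizeA; iL++ )
 *   {
 *       dotL += ( int32 )vec1A[ iL ] * vec2A[ iL ];
 *   }
 */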
/* ------------------------------------------------------------------------- */
/** Using full register (128-bit) in SSE2 to calculate dot product.
 * Dependencies: input vectors need to be 16-byte aligned (required by _mm_load_si128)
 * Return Value: int32 containing result of dot product
 */
int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;
	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize registers to 0 */
	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		/* process 16 int16 elements per iteration (2 x 128-bit loads per vector);
		   the madd result left in m_XMM6 is folded into the accumulator m_XMM5
		   at the start of the following iteration */
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_load_si128( (__m128i *)&0[vec1L] );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );
			m_XMM2 = _mm_load_si128( (__m128i *)&0[vec2L] );
			m_XMM6 = _mm_load_si128( (__m128i *)&8[vec1L] );
			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );
			m_XMM3 = _mm_load_si128( (__m128i *)&8[vec2L] );
			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );
			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators: add the four 32-bit partial sums of the register */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );
		m_XMM0 = _mm_load_si128( (__m128i *)&m_XMM5 );
		resultL = _mm_cvtsi128_si32( m_XMM0 );  /* 1st 32 bits */
		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32 bits */
		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32 bits */
		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32 bits */
	}

	/* handle the remaining 0..15 elements (intentional fall-through) */
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}
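
/* Alignment note (illustrative sketch only; the declarations below are
 * assumed caller-side code, not part of this file): the aligned variant above
 * requires both input vectors to start on a 16-byte boundary, e.g.
 *
 *   __declspec( align( 16 ) ) int16 bufL[ 256 ];              (MSVC)
 *   int16 bufL[ 256 ] __attribute__( ( aligned( 16 ) ) );     (GCC/Clang)
 *
 * Passing unaligned pointers to _mm_load_si128 will generally fault at run
 * time; use bbs_dotProduct_u128SSE2 below in that case.
 */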
/* ------------------------------------------------------------------------- */
/** Using full register (128-bit) in SSE2 to calculate dot product (unaligned version).
 * Dependencies: memory does not need to be 16-byte aligned
 * Return Value: int32 containing result of dot product
 */
int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;
	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize registers to 0 */
	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		/* process 16 int16 elements per iteration (2 x 128-bit unaligned loads
		   per vector); the madd result left in m_XMM6 is folded into the
		   accumulator m_XMM5 at the start of the following iteration */
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_loadu_si128( (__m128i *)&0[vec1L] );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );
			m_XMM2 = _mm_loadu_si128( (__m128i *)&0[vec2L] );
			m_XMM6 = _mm_loadu_si128( (__m128i *)&8[vec1L] );
			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );
			m_XMM3 = _mm_loadu_si128( (__m128i *)&8[vec2L] );
			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );
			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators: add the four 32-bit partial sums of the register */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );
		m_XMM0 = _mm_loadu_si128( (__m128i *)&m_XMM5 );
		resultL = _mm_cvtsi128_si32( m_XMM0 );  /* 1st 32 bits */
		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 2nd 32 bits */
		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 3rd 32 bits */
		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );
		resultL += _mm_cvtsi128_si32( m_XMM0 ); /* 4th 32 bits */
	}

	/* handle the remaining 0..15 elements (intentional fall-through) */
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}
/* ------------------------------------------------------------------------- */
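
/* Minimal usage sketch (assumption, not part of the original API): a caller
 * can select the aligned 128-bit variant only when both input pointers are
 * 16-byte aligned, and otherwise fall back to the unaligned variant. Here
 * vec1P, vec2P and sizeP are hypothetical caller-side variables, and
 * uintptr_t requires <stdint.h> (or an equivalent integer type wide enough
 * to hold a pointer):
 *
 *   int32 dotL;
 *   if( ( ( ( uintptr_t )vec1P | ( uintptr_t )vec2P ) & 0x0F ) == 0 )
 *   {
 *       dotL = bbs_dotProduct_128SSE2( vec1P, vec2P, sizeP );
 *   }
 *   else
 *   {
 *       dotL = bbs_dotProduct_u128SSE2( vec1P, vec2P, sizeP );
 *   }
 */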
#endif /* defined( WIN64 ) || defined( HW_SSE2 ) */