Optimization of resampling by 2 for ARMv7 in the signal processing library (spl).
Review URL: http://webrtc-codereview.appspot.com/92015

git-svn-id: http://webrtc.googlecode.com/svn/trunk@327 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/android-webrtc.mk b/android-webrtc.mk
index 735c766..510fd0b 100644
--- a/android-webrtc.mk
+++ b/android-webrtc.mk
@@ -23,13 +23,16 @@
 MY_WEBRTC_COMMON_DEFS += \
     '-DWEBRTC_ARM_INLINE_CALLS' \
     '-DWEBRTC_ARCH_ARM'
-# TODO: test if the code under next two MACROs works with generic GCC compilers
+
+# TODO(kma): test whether the code under the next two macros works with generic GCC compilers
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
 MY_WEBRTC_COMMON_DEFS += \
-    '-DWEBRTC_ANDROID_ARMV7A_NEON'
-else ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
+    '-DWEBRTC_ARCH_ARM_NEON'
+endif
+
+ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
 MY_WEBRTC_COMMON_DEFS += \
-    '-DWEBRTC_ANDROID_ARMV7A'
+    '-DWEBRTC_ARCH_ARM_V7A'
 endif
 else ifeq ($(TARGET_ARCH),x86)
 MY_WEBRTC_COMMON_DEFS += \
diff --git a/src/common_audio/signal_processing_library/main/source/resample_by_2.c b/src/common_audio/signal_processing_library/main/source/resample_by_2.c
index 7ed4cfd..2083b75 100644
--- a/src/common_audio/signal_processing_library/main/source/resample_by_2.c
+++ b/src/common_audio/signal_processing_library/main/source/resample_by_2.c
@@ -36,43 +36,66 @@
     outptr = out; // output array (of length len/2)
     state = filtState; // filter state array; length = 8
 
+    register WebRtc_Word32 state0 = state[0];
+    register WebRtc_Word32 state1 = state[1];
+    register WebRtc_Word32 state2 = state[2];
+    register WebRtc_Word32 state3 = state[3];
+    register WebRtc_Word32 state4 = state[4];
+    register WebRtc_Word32 state5 = state[5];
+    register WebRtc_Word32 state6 = state[6];
+    register WebRtc_Word32 state7 = state[7];
+
     for (i = (len >> 1); i > 0; i--)
     {
         // lower allpass filter
         in32 = (WebRtc_Word32)(*inptr++) << 10;
-        diff = in32 - state[1];
-        tmp1 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[0], diff, state[0] );
-        state[0] = in32;
-        diff = tmp1 - state[2];
-        tmp2 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[1], diff, state[1] );
-        state[1] = tmp1;
-        diff = tmp2 - state[3];
-        state[3] = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[2], diff, state[2] );
-        state[2] = tmp2;
+        diff = in32 - state1;
+        tmp1 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[0], diff, state0 );
+        state0 = in32;
+        diff = tmp1 - state2;
+        tmp2 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[1], diff, state1 );
+        state1 = tmp1;
+        diff = tmp2 - state3;
+        state3 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[2], diff, state2 );
+        state2 = tmp2;
 
         // upper allpass filter
         in32 = (WebRtc_Word32)(*inptr++) << 10;
-        diff = in32 - state[5];
-        tmp1 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[0], diff, state[4] );
-        state[4] = in32;
-        diff = tmp1 - state[6];
-        tmp2 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[1], diff, state[5] );
-        state[5] = tmp1;
-        diff = tmp2 - state[7];
-        state[7] = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[2], diff, state[6] );
-        state[6] = tmp2;
+        diff = in32 - state5;
+        tmp1 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[0], diff, state4 );
+        state4 = in32;
+        diff = tmp1 - state6;
+        tmp2 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[1], diff, state5 );
+        state5 = tmp1;
+        diff = tmp2 - state7;
+        state7 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[2], diff, state6 );
+        state6 = tmp2;
 
         // add two allpass outputs, divide by two and round
-        out32 = (state[3] + state[7] + 1024) >> 11;
+        out32 = (state3 + state7 + 1024) >> 11;
 
         // limit amplitude to prevent wrap-around, and write to output array
+#ifdef WEBRTC_ARCH_ARM_V7A
+        __asm__("ssat %r0, #16, %r1" : "=r"(*outptr) : "r"(out32));
+        outptr++;
+#else
         if (out32 > 32767)
             *outptr++ = 32767;
         else if (out32 < -32768)
             *outptr++ = -32768;
         else
             *outptr++ = (WebRtc_Word16)out32;
+#endif
     }
+
+    state[0]=state0;
+    state[1]=state1;
+    state[2]=state2;
+    state[3]=state3;
+    state[4]=state4;
+    state[5]=state5;
+    state[6]=state6;
+    state[7]=state7;
 }
 
 void WebRtcSpl_UpsampleBy2(const WebRtc_Word16* in, WebRtc_Word16 len, WebRtc_Word16* out,
@@ -89,47 +112,75 @@
     outptr = out; // output array (of length len*2)
     state = filtState; // filter state array; length = 8
 
+    register WebRtc_Word32 state0 = state[0];
+    register WebRtc_Word32 state1 = state[1];
+    register WebRtc_Word32 state2 = state[2];
+    register WebRtc_Word32 state3 = state[3];
+    register WebRtc_Word32 state4 = state[4];
+    register WebRtc_Word32 state5 = state[5];
+    register WebRtc_Word32 state6 = state[6];
+    register WebRtc_Word32 state7 = state[7];
+
     for (i = len; i > 0; i--)
     {
         // lower allpass filter
         in32 = (WebRtc_Word32)(*inptr++) << 10;
-        diff = in32 - state[1];
-        tmp1 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[0], diff, state[0] );
-        state[0] = in32;
-        diff = tmp1 - state[2];
-        tmp2 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[1], diff, state[1] );
-        state[1] = tmp1;
-        diff = tmp2 - state[3];
-        state[3] = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[2], diff, state[2] );
-        state[2] = tmp2;
+        diff = in32 - state1;
+        tmp1 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[0], diff, state0 );
+        state0 = in32;
+        diff = tmp1 - state2;
+        tmp2 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[1], diff, state1 );
+        state1 = tmp1;
+        diff = tmp2 - state3;
+        state3 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass1[2], diff, state2 );
+        state2 = tmp2;
 
         // round; limit amplitude to prevent wrap-around; write to output array
-        out32 = (state[3] + 512) >> 10;
+        out32 = (state3 + 512) >> 10;
+#ifdef WEBRTC_ARCH_ARM_V7A
+        __asm__("ssat %r0, #16, %r1":"=r"(*outptr): "r"(out32));
+        outptr++;
+#else
         if (out32 > 32767)
             *outptr++ = 32767;
         else if (out32 < -32768)
             *outptr++ = -32768;
         else
             *outptr++ = (WebRtc_Word16)out32;
+#endif
 
         // upper allpass filter
-        diff = in32 - state[5];
-        tmp1 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[0], diff, state[4] );
-        state[4] = in32;
-        diff = tmp1 - state[6];
-        tmp2 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[1], diff, state[5] );
-        state[5] = tmp1;
-        diff = tmp2 - state[7];
-        state[7] = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[2], diff, state[6] );
-        state[6] = tmp2;
+        diff = in32 - state5;
+        tmp1 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[0], diff, state4 );
+        state4 = in32;
+        diff = tmp1 - state6;
+        tmp2 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[1], diff, state5 );
+        state5 = tmp1;
+        diff = tmp2 - state7;
+        state7 = WEBRTC_SPL_SCALEDIFF32( kResampleAllpass2[2], diff, state6 );
+        state6 = tmp2;
 
         // round; limit amplitude to prevent wrap-around; write to output array
-        out32 = (state[7] + 512) >> 10;
+        out32 = (state7 + 512) >> 10;
+#ifdef WEBRTC_ARCH_ARM_V7A
+        __asm__("ssat %r0, #16, %r1":"=r"(*outptr): "r"(out32));
+        outptr++;
+#else
         if (out32 > 32767)
             *outptr++ = 32767;
         else if (out32 < -32768)
             *outptr++ = -32768;
         else
             *outptr++ = (WebRtc_Word16)out32;
+#endif
     }
+    state[0]=state0;
+    state[1]=state1;
+    state[2]=state2;
+    state[3]=state3;
+    state[4]=state4;
+    state[5]=state5;
+    state[6]=state6;
+    state[7]=state7;
+
 }