Performance on ARMv7 improved by 7-8%.
Review URL: http://webrtc-codereview.appspot.com/114007

git-svn-id: http://webrtc.googlecode.com/svn/trunk@384 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/src/common_audio/signal_processing_library/main/source/complex_fft.c b/src/common_audio/signal_processing_library/main/source/complex_fft.c
index 6e9069a..bcaa076 100644
--- a/src/common_audio/signal_processing_library/main/source/complex_fft.c
+++ b/src/common_audio/signal_processing_library/main/source/complex_fft.c
@@ -98,19 +98,39 @@
                 wr = WebRtcSpl_kSinTable1024[j + 256];
                 wi = -WebRtcSpl_kSinTable1024[j];
 
+#ifdef WEBRTC_ARCH_ARM_V7A
+                WebRtc_Word32 wri;
+                WebRtc_Word32 frfi_r;
+                __asm__("pkhbt %0, %1, %2, lsl #16" : "=r"(wri) :
+                    "r"((WebRtc_Word32)wr), "r"((WebRtc_Word32)wi));
+#endif
+
                 for (i = m; i < n; i += istep)
                 {
                     j = i + l;
 
-                    tr32 = WEBRTC_SPL_RSHIFT_W32((WEBRTC_SPL_MUL_16_16(wr, frfi[2 * j])
-                            - WEBRTC_SPL_MUL_16_16(wi, frfi[2 * j + 1]) + CFFTRND),
-                            15 - CFFTSFT);
+#ifdef WEBRTC_ARCH_ARM_V7A
+                    __asm__("pkhbt %0, %1, %2, lsl #16" : "=r"(frfi_r) :
+                        "r"((WebRtc_Word32)frfi[2*j]), "r"((WebRtc_Word32)frfi[2*j +1]));
+                    __asm__("smlsd %0, %1, %2, %3" : "=r"(tr32) :
+                        "r"(wri), "r"(frfi_r), "r"(CFFTRND));
+                    __asm__("smladx %0, %1, %2, %3" : "=r"(ti32) :
+                        "r"(wri), "r"(frfi_r), "r"(CFFTRND));
+    
+#else
+                    tr32 = WEBRTC_SPL_MUL_16_16(wr, frfi[2 * j])
+                            - WEBRTC_SPL_MUL_16_16(wi, frfi[2 * j + 1]) + CFFTRND;
 
-                    ti32 = WEBRTC_SPL_RSHIFT_W32((WEBRTC_SPL_MUL_16_16(wr, frfi[2 * j + 1])
-                            + WEBRTC_SPL_MUL_16_16(wi, frfi[2 * j]) + CFFTRND), 15 - CFFTSFT);
+                    ti32 = WEBRTC_SPL_MUL_16_16(wr, frfi[2 * j + 1])
+                            + WEBRTC_SPL_MUL_16_16(wi, frfi[2 * j]) + CFFTRND;
+#endif
+
+                    tr32 = WEBRTC_SPL_RSHIFT_W32(tr32, 15 - CFFTSFT);
+                    ti32 = WEBRTC_SPL_RSHIFT_W32(ti32, 15 - CFFTSFT);
 
                     qr32 = ((WebRtc_Word32)frfi[2 * i]) << CFFTSFT;
                     qi32 = ((WebRtc_Word32)frfi[2 * i + 1]) << CFFTSFT;
+
                     frfi[2 * j] = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(
                             (qr32 - tr32 + CFFTRND2), 1 + CFFTSFT);
                     frfi[2 * j + 1] = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(
diff --git a/src/common_audio/signal_processing_library/main/source/complex_ifft.c b/src/common_audio/signal_processing_library/main/source/complex_ifft.c
index fffcc0c..c2e4b4f 100644
--- a/src/common_audio/signal_processing_library/main/source/complex_ifft.c
+++ b/src/common_audio/signal_processing_library/main/source/complex_ifft.c
@@ -110,21 +110,38 @@
                 wr = WebRtcSpl_kSinTable1024[j + 256];
                 wi = WebRtcSpl_kSinTable1024[j];
 
+#ifdef WEBRTC_ARCH_ARM_V7A
+                WebRtc_Word32 wri;
+                WebRtc_Word32 frfi_r;
+                __asm__("pkhbt %0, %1, %2, lsl #16" : "=r"(wri) :
+                    "r"((WebRtc_Word32)wr), "r"((WebRtc_Word32)wi));
+#endif
+
                 for (i = m; i < n; i += istep)
                 {
                     j = i + l;
 
-                    tr32 = WEBRTC_SPL_RSHIFT_W32((WEBRTC_SPL_MUL_16_16_RSFT(wr, frfi[2 * j], 0)
-                            - WEBRTC_SPL_MUL_16_16_RSFT(wi, frfi[2 * j + 1], 0) + CIFFTRND),
-                            15 - CIFFTSFT);
+#ifdef WEBRTC_ARCH_ARM_V7A
+                    __asm__("pkhbt %0, %1, %2, lsl #16" : "=r"(frfi_r) :
+                        "r"((WebRtc_Word32)frfi[2*j]), "r"((WebRtc_Word32)frfi[2*j +1]));
+                    __asm__("smlsd %0, %1, %2, %3" : "=r"(tr32) :
+                        "r"(wri), "r"(frfi_r), "r"(CIFFTRND));
+                    __asm__("smladx %0, %1, %2, %3" : "=r"(ti32) :
+                        "r"(wri), "r"(frfi_r), "r"(CIFFTRND));
+#else
 
-                    ti32 = WEBRTC_SPL_RSHIFT_W32(
-                                    (WEBRTC_SPL_MUL_16_16_RSFT(wr, frfi[2 * j + 1], 0)
-                                            + WEBRTC_SPL_MUL_16_16_RSFT(wi, frfi[2 * j], 0)
-                                            + CIFFTRND), 15 - CIFFTSFT);
+                    tr32 = WEBRTC_SPL_MUL_16_16(wr, frfi[2 * j])
+                            - WEBRTC_SPL_MUL_16_16(wi, frfi[2 * j + 1]) + CIFFTRND;
 
+                    ti32 = WEBRTC_SPL_MUL_16_16(wr, frfi[2 * j + 1])
+                            + WEBRTC_SPL_MUL_16_16(wi, frfi[2 * j]) + CIFFTRND;
+#endif
+                    tr32 = WEBRTC_SPL_RSHIFT_W32(tr32, 15 - CIFFTSFT);
+                    ti32 = WEBRTC_SPL_RSHIFT_W32(ti32, 15 - CIFFTSFT);
+    
                     qr32 = ((WebRtc_Word32)frfi[2 * i]) << CIFFTSFT;
                     qi32 = ((WebRtc_Word32)frfi[2 * i + 1]) << CIFFTSFT;
+    
                     frfi[2 * j] = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32((qr32 - tr32+round2),
                                                                        shift+CIFFTSFT);
                     frfi[2 * j + 1] = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(