Cisco optimization for x86 & fixed point 1. Only for fixed point on x86 platform (32bit and 64bit, uses SIMD intrinsics up to SSE4.2) 2. Use "configure --enable-fixed-point --enable-intrinsics" to enable optimization, default is disabled. 3. Official test cases are verified and passed. Signed-off-by: Timothy B. Terriberry <tterribe@xiph.org>

commit: c95c9a048f3283afb2e412b10085d4f7c19e1412 [log] [tgz]
author: xiangmingzhu <xiangzhu@cisco.com> Wed Apr 30 15:48:07 2014 +0800
committer: Jean-Marc Valin <jmvalin@jmvalin.ca> Fri Oct 03 21:16:00 2014 -0400
tree: ed8873af6559d7a98922e0fed85be47c826ef521
parent: 80460334b77d70e665a390503cd8992cdad06c10 [diff]
diff --git a/celt/bands.c b/celt/bands.c
index 30a5894..c643b09 100644
--- a/celt/bands.c
+++ b/celt/bands.c

@@ -164,7 +164,7 @@
       for (i=0;i<end;i++)
       {
          opus_val32 sum;
-         sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
+         sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
          bandE[i+c*m->nbEBands] = celt_sqrt(sum);
          /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
       }
@@ -266,7 +266,7 @@
 /* This prevents energy collapse for transients with multiple short MDCTs */
 void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
       int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
-      const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed)
+      const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed, int arch)
 {
    int c, i, j, k;
    for (i=start;i<end;i++)
@@ -355,7 +355,7 @@
          }
          /* We just added some energy, so we need to renormalise */
          if (renormalize)
-            renormalise_vector(X, N0<<LM, Q15ONE);
+            renormalise_vector(X, N0<<LM, Q15ONE, arch);
       } while (++c<C);
    }
 }
@@ -656,6 +656,7 @@
    opus_int32 remaining_bits;
    const celt_ener *bandE;
    opus_uint32 seed;
+   int arch;
 };
 
 struct split_ctx {
@@ -707,7 +708,7 @@
          side and mid. With just that parameter, we can re-scale both
          mid and side because we know that 1) they have unit norm and
          2) they are orthogonal. */
-      itheta = stereo_itheta(X, Y, stereo, N);
+      itheta = stereo_itheta(X, Y, stereo, N, ctx->arch);
    }
    tell = ec_tell_frac(ec);
    if (qn!=1)
@@ -1055,7 +1056,7 @@
                   }
                   cm = fill;
                }
-               renormalise_vector(X, N, gain);
+               renormalise_vector(X, N, gain, ctx->arch);
             }
          }
       }
@@ -1360,9 +1361,11 @@
 
 
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
-      celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
-      int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
-      opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int LM, int codedBands, opus_uint32 *seed)
+      celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
+      const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
+      int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
+      opus_int32 balance, ec_ctx *ec, int LM, int codedBands,
+      opus_uint32 *seed, int arch)
 {
    int i;
    opus_int32 remaining_bits;
@@ -1404,6 +1407,7 @@
    ctx.m = m;
    ctx.seed = *seed;
    ctx.spread = spread;
+   ctx.arch = arch;
    for (i=start;i<end;i++)
    {
       opus_int32 tell;

diff --git a/celt/bands.h b/celt/bands.h
index 69901b1..e8bef4b 100644
--- a/celt/bands.h
+++ b/celt/bands.h

@@ -98,15 +98,20 @@
  * @param LM log2() of the number of 2.5 subframes in the frame
  * @param codedBands Last band to receive bits + 1
  * @param seed Random generator seed
+ * @param arch Run-time architecture (see opus_select_arch())
  */
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
-      celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
-      int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
-      opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);
+      celt_norm * X, celt_norm * Y, unsigned char *collapse_masks,
+      const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
+      int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
+      opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed,
+      int arch);
 
-void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
-      int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
-      const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed);
+void anti_collapse(const CELTMode *m, celt_norm *X_,
+      unsigned char *collapse_masks, int LM, int C, int size, int start,
+      int end, const opus_val16 *logE, const opus_val16 *prev1logE,
+      const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed,
+      int arch);
 
 opus_uint32 celt_lcg_rand(opus_uint32 seed);
 

diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 8af96b7..4304a3e 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c

@@ -499,7 +499,7 @@
                seed = celt_lcg_rand(seed);
                X[boffs+j] = (celt_norm)((opus_int32)seed>>20);
             }
-            renormalise_vector(X+boffs, blen, Q15ONE);
+            renormalise_vector(X+boffs, blen, Q15ONE, st->arch);
          }
       }
       st->rng = seed;
@@ -583,7 +583,7 @@
             }
             /* Compute the excitation for exc_length samples before the loss. */
             celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
-                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem);
+                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch);
          }
 
          /* Check if the waveform is decaying, and if so how fast.
@@ -650,7 +650,7 @@
                the signal domain. */
             celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
                   buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
-                  lpc_mem);
+                  lpc_mem, st->arch);
          }
 
          /* Check if the synthesis energy is higher than expected, which can
@@ -982,7 +982,7 @@
 
    quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
          NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
-         len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng);
+         len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng, st->arch);
 
    if (anti_collapse_rsv > 0)
    {
@@ -994,7 +994,7 @@
 
    if (anti_collapse_on)
       anti_collapse(mode, X, collapse_masks, LM, C, N,
-            start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
+            start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch);
 
    if (silence)
    {

diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index 7387ad5..6e2827f 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c

@@ -751,7 +751,7 @@
 static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
       const opus_val16 *bandLogE, int end, int LM, int C, int N0,
       AnalysisInfo *analysis, opus_val16 *stereo_saving, opus_val16 tf_estimate,
-      int intensity, opus_val16 surround_trim)
+      int intensity, opus_val16 surround_trim, int arch)
 {
    int i;
    opus_val32 diff=0;
@@ -767,7 +767,8 @@
       for (i=0;i<8;i++)
       {
          opus_val32 partial;
-         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
+         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
+               (m->eBands[i+1]-m->eBands[i])<<LM, arch);
          sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
       }
       sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
@@ -776,7 +777,8 @@
       for (i=8;i<intensity;i++)
       {
          opus_val32 partial;
-         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
+         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
+               (m->eBands[i+1]-m->eBands[i])<<LM, arch);
          minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
       }
       minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
@@ -1097,7 +1099,7 @@
       pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
 
       gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
-            N, &pitch_index, st->prefilter_period, st->prefilter_gain);
+            N, &pitch_index, st->prefilter_period, st->prefilter_gain, st->arch);
       if (pitch_index > COMBFILTER_MAXPERIOD-2)
          pitch_index = COMBFILTER_MAXPERIOD-2;
       gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1);
@@ -1887,7 +1889,8 @@
          alloc_trim = 5;
       else
          alloc_trim = alloc_trim_analysis(mode, X, bandLogE,
-            end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, st->intensity, surround_trim);
+            end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate,
+            st->intensity, surround_trim, st->arch);
       ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
       tell = ec_tell_frac(enc);
    }
@@ -2022,8 +2025,9 @@
    /* Residual quantisation */
    ALLOC(collapse_masks, C*nbEBands, unsigned char);
    quant_all_bands(1, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
-         bandE, pulses, shortBlocks, st->spread_decision, dual_stereo, st->intensity, tf_res,
-         nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, balance, enc, LM, codedBands, &st->rng);
+         bandE, pulses, shortBlocks, st->spread_decision,
+         dual_stereo, st->intensity, tf_res, nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv,
+         balance, enc, LM, codedBands, &st->rng, st->arch);
 
    if (anti_collapse_rsv > 0)
    {

diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index fa29d62..4856648 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c

@@ -88,12 +88,15 @@
 #endif
 }
 
-void celt_fir(const opus_val16 *_x,
+
+void celt_fir_c(
+         const opus_val16 *_x,
          const opus_val16 *num,
          opus_val16 *_y,
          int N,
          int ord,
-         opus_val16 *mem)
+         opus_val16 *mem,
+         int arch)
 {
    int i,j;
    VARDECL(opus_val16, rnum);
@@ -124,7 +127,7 @@
    for (i=0;i<N-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-      xcorr_kernel(rnum, x+i, sum, ord);
+      xcorr_kernel(rnum, x+i, sum, ord, arch);
       _y[i  ] = SATURATE16(ADD32(EXTEND32(_x[i  ]), PSHR32(sum[0], SIG_SHIFT)));
       _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
       _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
@@ -146,7 +149,8 @@
          opus_val32 *_y,
          int N,
          int ord,
-         opus_val16 *mem)
+         opus_val16 *mem,
+         int arch)
 {
 #ifdef SMALL_FOOTPRINT
    int i,j;
@@ -187,7 +191,7 @@
       sum[1]=_x[i+1];
       sum[2]=_x[i+2];
       sum[3]=_x[i+3];
-      xcorr_kernel(rden, y+i, sum, ord);
+      xcorr_kernel(rden, y+i, sum, ord, arch);
 
       /* Patch up the result to compensate for the fact that this is an IIR */
       y[i+ord  ] = -ROUND16(sum[0],SIG_SHIFT);

diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index dc2a0a3..dc8967f 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h

@@ -29,24 +29,37 @@
 #define PLC_H
 
 #include "arch.h"
+#include "cpu_support.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/celt_lpc_sse.h"
+#endif
 
 #define LPC_ORDER 24
 
 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
 
-void celt_fir(const opus_val16 *x,
+void celt_fir_c(
+         const opus_val16 *x,
          const opus_val16 *num,
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem);
+         opus_val16 *mem,
+         int arch);
+
+#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define celt_fir(x, num, y, N, ord, mem, arch) \
+    (celt_fir_c(x, num, y, N, ord, mem, arch))
+#endif
 
 void celt_iir(const opus_val32 *x,
          const opus_val16 *den,
          opus_val32 *y,
          int N,
          int ord,
-         opus_val16 *mem);
+         opus_val16 *mem,
+         int arch);
 
 int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
          const opus_val16 *window, int overlap, int lag, int n, int arch);

diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index d68dbe6..71efff1 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h

@@ -42,6 +42,18 @@
  */
 #define OPUS_ARCHMASK 3
 
+#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
+
+#include "x86/x86cpu.h"
+/* We currently support 3 x86 variants:
+ * arch[0] -> non-sse
+ * arch[1] -> sse2
+ * arch[2] -> sse4.1
+ * arch[3] -> NULL
+ */
+#define OPUS_ARCHMASK 3
+int opus_select_arch(void);
+
 #else
 #define OPUS_ARCHMASK 0
 
@@ -50,5 +62,4 @@
   return 0;
 }
 #endif
-
 #endif

diff --git a/celt/entenc.c b/celt/entenc.c
index 271e4d3..f1750d2 100644
--- a/celt/entenc.c
+++ b/celt/entenc.c

@@ -98,7 +98,7 @@
   else _this->ext++;
 }
 
-static void ec_enc_normalize(ec_enc *_this){
+static OPUS_INLINE void ec_enc_normalize(ec_enc *_this){
   /*If the range is too small, output some bits and rescale it.*/
   while(_this->rng<=EC_CODE_BOT){
     ec_enc_carry_out(_this,(int)(_this->val>>EC_CODE_SHIFT));

diff --git a/celt/mips/vq_mipsr1.h b/celt/mips/vq_mipsr1.h
index 3cea070..0affae0 100644
--- a/celt/mips/vq_mipsr1.h
+++ b/celt/mips/vq_mipsr1.h

@@ -73,7 +73,11 @@
 }
 
 #define OVERRIDE_renormalise_vector
-void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
+
+#define renormalise_vector(X, N, gain, arch) \
+ ((void)(arch), renormalize_vector_mips(x, N, gain))
+
+void renormalise_vector_mips(celt_norm *X, int N, opus_val16 gain)
 {
    int i;
 #ifdef FIXED_POINT

diff --git a/celt/pitch.c b/celt/pitch.c
index 2f0b14b..154c848 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c

@@ -250,7 +250,8 @@
 #else
 void
 #endif
-celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
+      opus_val32 *xcorr, int len, int max_pitch, int arch)
 {
    int i;
    /*The EDSP version requires that max_pitch is at least 1, and that _x is
@@ -264,7 +265,7 @@
    for (i=0;i<max_pitch-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-      xcorr_kernel(_x, _y+i, sum, len);
+      xcorr_kernel(_x, _y+i, sum, len, arch);
       xcorr[i]=sum[0];
       xcorr[i+1]=sum[1];
       xcorr[i+2]=sum[2];
@@ -280,7 +281,7 @@
    for (;i<max_pitch;i++)
    {
       opus_val32 sum;
-      sum = celt_inner_prod(_x, _y+i, len);
+      sum = celt_inner_prod(_x, _y+i, len, arch);
       xcorr[i] = sum;
 #ifdef FIXED_POINT
       maxcorr = MAX32(maxcorr, sum);
@@ -369,7 +370,7 @@
       for (j=0;j<len>>1;j++)
          sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
 #else
-      sum = celt_inner_prod(x_lp, y+i, len>>1);
+      sum = celt_inner_prod_c(x_lp, y+i, len>>1);
 #endif
       xcorr[i] = MAX32(-1, sum);
 #ifdef FIXED_POINT
@@ -405,7 +406,7 @@
 
 static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
 opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
-      int N, int *T0_, int prev_period, opus_val16 prev_gain)
+      int N, int *T0_, int prev_period, opus_val16 prev_gain, int arch)
 {
    int k, i, T, T0;
    opus_val16 g, g0;
@@ -517,7 +518,7 @@
       pg = SHR32(frac_div32(best_xy,best_yy+1),16);
 
    for (k=0;k<3;k++)
-      xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
+      xcorr[k] = celt_inner_prod(x, x-(T+k-1), N, arch);
    if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
       offset = 1;
    else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))

diff --git a/celt/pitch.h b/celt/pitch.h
index b199976..e692c59 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h

@@ -37,7 +37,8 @@
 #include "modes.h"
 #include "cpu_support.h"
 
-#if defined(__SSE__) && !defined(FIXED_POINT)
+#if defined(__SSE__) && !defined(FIXED_POINT) \
+ || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
 #include "x86/pitch_sse.h"
 #endif
 
@@ -56,12 +57,13 @@
                   int len, int max_pitch, int *pitch, int arch);
 
 opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
-      int N, int *T0, int prev_period, opus_val16 prev_gain);
+      int N, int *T0, int prev_period, opus_val16 prev_gain, int arch);
+
 
 /* OPT: This is the kernel you really want to optimize. It gets used a lot
    by the prefilter and by the PLC. */
 #ifndef OVERRIDE_XCORR_KERNEL
-static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
+static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
 {
    int j;
    opus_val16 y_0, y_1, y_2, y_3;
@@ -126,8 +128,15 @@
       sum[3] = MAC16_16(sum[3],tmp,y_1);
    }
 }
+
+#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)(arch),xcorr_kernel_c(x, y, sum, len))
+#endif
+
 #endif /* OVERRIDE_XCORR_KERNEL */
 
+
 #ifndef OVERRIDE_DUAL_INNER_PROD
 static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
       int N, opus_val32 *xy1, opus_val32 *xy2)
@@ -145,9 +154,10 @@
 }
 #endif
 
-#ifndef OVERRIDE_CELT_INNER_PROD
-static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
-      int N)
+/*We make sure a C version is always available for cases where the overhead of
+  vectorization and passing around an arch flag aren't worth it.*/
+static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x,
+      const opus_val16 *y, int N)
 {
    int i;
    opus_val32 xy=0;
@@ -155,6 +165,10 @@
       xy = MAC16_16(xy, x[i], y[i]);
    return xy;
 }
+
+#if !defined(OVERRIDE_CELT_INNER_PROD)
+# define celt_inner_prod(x, y, N, arch) \
+    ((void)(arch),celt_inner_prod_c(x, y, N))
 #endif
 
 #ifdef FIXED_POINT
@@ -163,11 +177,11 @@
 void
 #endif
 celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
-      opus_val32 *xcorr, int len, int max_pitch);
+      opus_val32 *xcorr, int len, int max_pitch, int arch);
 
 #if !defined(OVERRIDE_PITCH_XCORR)
 /*Is run-time CPU detection enabled on this platform?*/
-# if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
 extern
 #  if defined(FIXED_POINT)
 opus_val32
@@ -179,10 +193,10 @@
 
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
+        xcorr, len, max_pitch, arch))
 # else
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-  ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))
+  ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch, arch))
 # endif
 #endif
 

diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index 4bb780e..5060f7f 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c

@@ -36,6 +36,8 @@
 
 #define CELT_C
 
+#include <stdio.h>
+#include <math.h>
 #include "mathops.c"
 #include "entenc.c"
 #include "entdec.c"
@@ -45,8 +47,16 @@
 #include "laplace.c"
 #include "vq.c"
 #include "cwrs.c"
-#include <stdio.h>
-#include <math.h>
+#include "pitch.c"
+#include "celt_lpc.c"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/pitch_sse.c"
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/celt_lpc_sse.c"
+#endif
+#include "x86/x86_celt_map.c"
+#endif
 
 #ifdef FIXED_POINT
 #define WORD "%d"

diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index ce5f096..4dce1af 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c

@@ -44,7 +44,18 @@
 #include "entdec.c"
 #include "mathops.c"
 #include "bands.h"
+#include "pitch.c"
+#include "celt_lpc.c"
 #include <math.h>
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/pitch_sse.c"
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/celt_lpc_sse.c"
+#endif
+#include "x86/x86_celt_map.c"
+#endif
+
 #define MAX_SIZE 100
 
 int ret=0;

diff --git a/celt/vq.c b/celt/vq.c
index b047b22..0c58cdd 100644
--- a/celt/vq.c
+++ b/celt/vq.c

@@ -350,7 +350,7 @@
 }
 
 #ifndef OVERRIDE_renormalise_vector
-void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
+void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
 {
    int i;
 #ifdef FIXED_POINT
@@ -360,7 +360,7 @@
    opus_val16 g;
    opus_val32 t;
    celt_norm *xptr;
-   E = EPSILON + celt_inner_prod(X, X, N);
+   E = EPSILON + celt_inner_prod(X, X, N, arch);
 #ifdef FIXED_POINT
    k = celt_ilog2(E)>>1;
 #endif
@@ -377,7 +377,7 @@
 }
 #endif /* OVERRIDE_renormalise_vector */
 
-int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N)
+int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch)
 {
    int i;
    int itheta;
@@ -396,8 +396,8 @@
          Eside = MAC16_16(Eside, s, s);
       }
    } else {
-      Emid += celt_inner_prod(X, X, N);
-      Eside += celt_inner_prod(Y, Y, N);
+      Emid += celt_inner_prod(X, X, N, arch);
+      Eside += celt_inner_prod(Y, Y, N, arch);
    }
    mid = celt_sqrt(Emid);
    side = celt_sqrt(Eside);

diff --git a/celt/vq.h b/celt/vq.h
index 84115cb..f895820 100644
--- a/celt/vq.h
+++ b/celt/vq.h

@@ -63,8 +63,8 @@
 unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
       ec_dec *dec, opus_val16 gain);
 
-void renormalise_vector(celt_norm *X, int N, opus_val16 gain);
+void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch);
 
-int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N);
+int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch);
 
 #endif /* VQ_H */

diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c
new file mode 100644
index 0000000..9fb9779
--- /dev/null
+++ b/celt/x86/celt_lpc_sse.c

@@ -0,0 +1,128 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+void celt_fir_sse4_1(const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *_y,
+         int N,
+         int ord,
+         opus_val16 *mem,
+         int arch)
+{
+    int i,j;
+    VARDECL(opus_val16, rnum);
+    VARDECL(opus_val16, x);
+
+    __m128i vecNoA;
+    opus_int32 noA ;
+    SAVE_STACK;
+
+   ALLOC(rnum, ord, opus_val16);
+   ALLOC(x, N+ord, opus_val16);
+   for(i=0;i<ord;i++)
+      rnum[i] = num[ord-i-1];
+   for(i=0;i<ord;i++)
+      x[i] = mem[ord-i-1];
+
+   for (i=0;i<N-7;i+=8)
+   {
+       x[i+ord  ]=_x[i  ];
+       x[i+ord+1]=_x[i+1];
+       x[i+ord+2]=_x[i+2];
+       x[i+ord+3]=_x[i+3];
+       x[i+ord+4]=_x[i+4];
+       x[i+ord+5]=_x[i+5];
+       x[i+ord+6]=_x[i+6];
+       x[i+ord+7]=_x[i+7];
+   }
+
+   for (;i<N-3;i+=4)
+   {
+       x[i+ord  ]=_x[i  ];
+       x[i+ord+1]=_x[i+1];
+       x[i+ord+2]=_x[i+2];
+       x[i+ord+3]=_x[i+3];
+   }
+
+   for (;i<N;i++)
+         x[i+ord]=_x[i];
+
+   for(i=0;i<ord;i++)
+      mem[i] = _x[N-i-1];
+#ifdef SMALL_FOOTPRINT
+   for (i=0;i<N;i++)
+   {
+      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+      for (j=0;j<ord;j++)
+      {
+         sum = MAC16_16(sum,rnum[j],x[i+j]);
+      }
+      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+   }
+#else
+   noA = EXTEND32(1) << SIG_SHIFT >> 1;
+   vecNoA = _mm_set_epi32(noA, noA, noA, noA);
+
+   for (i=0;i<N-3;i+=4)
+   {
+      opus_val32 sums[4] = {0};
+      __m128i vecSum, vecX;
+
+      xcorr_kernel(rnum, x+i, sums, ord, arch);
+
+      vecSum = _mm_loadu_si128((__m128i *)sums);
+      vecSum = _mm_add_epi32(vecSum, vecNoA);
+      vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
+      vecX = OP_CVTEPI16_EPI32_M64(_x + i);
+      vecSum = _mm_add_epi32(vecSum, vecX);
+      vecSum = _mm_packs_epi32(vecSum, vecSum);
+      _mm_storel_epi64((__m128i *)(_y + i), vecSum);
+   }
+   for (;i<N;i++)
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<ord;j++)
+         sum = MAC16_16(sum, rnum[j], x[i + j]);
+      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+   }
+
+#endif
+   RESTORE_STACK;
+}

diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
new file mode 100644
index 0000000..f111420
--- /dev/null
+++ b/celt/x86/celt_lpc_sse.h

@@ -0,0 +1,58 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_LPC_SSE_H
+#define CELT_LPC_SSE_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+void celt_fir_sse4_1(
+         const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         opus_val16 *mem,
+         int arch);
+
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         int ord,
+         opus_val16 *mem,
+         int arch);
+
+#  define celt_fir(x, num, y, N, ord, mem, arch) \
+    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))
+
+#endif
+#endif

diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c
new file mode 100644
index 0000000..e3bc6d7
--- /dev/null
+++ b/celt/x86/pitch_sse.c

@@ -0,0 +1,251 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include <smmintrin.h>
+#include "x86cpu.h"
+
+opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+    opus_int  i, dataSize16;
+    opus_int32 sum;
+    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+    __m128i inVec1_3210, inVec2_3210;
+
+    sum = 0;
+    dataSize16 = N & ~15;
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+
+    for (i=0;i<dataSize16;i+=16) {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+    }
+
+    acc1 = _mm_add_epi32(acc1, acc2);
+
+    if (N - i >= 8)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        i += 8;
+    }
+
+    if (N - i >= 4)
+    {
+        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
+        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
+
+        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_3210);
+        i += 4;
+    }
+
+    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
+    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
+
+    sum += _mm_cvtsi128_si32(acc1);
+
+    for (;i<N;i++)
+    {
+        sum = silk_SMLABB(sum, x[i], y[i]);
+    }
+
+    return sum;
+}
+
+void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
+{
+    int j;
+
+    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
+    __m128i vecY0, vecY1, vecY2, vecY3;
+    __m128i sum0, sum1, sum2, sum3, vecSum;
+    __m128i initSum;
+
+    celt_assert(len >= 3);
+
+    sum0 = _mm_setzero_si128();
+    sum1 = _mm_setzero_si128();
+    sum2 = _mm_setzero_si128();
+    sum3 = _mm_setzero_si128();
+
+    for (j=0;j<(len-7);j+=8)
+    {
+        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
+        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
+        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
+        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
+        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
+
+        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
+        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
+        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
+        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
+    }
+
+    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
+    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
+
+    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
+    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
+
+    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
+    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
+
+    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
+    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
+
+    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
+          _mm_unpacklo_epi32(sum2, sum3));
+
+    for (;j<(len-3);j+=4)
+    {
+        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
+        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+        sum2 = _mm_mullo_epi32(vecX2, vecY2);
+        sum3 = _mm_mullo_epi32(vecX3, vecY3);
+
+        sum0 = _mm_add_epi32(sum0, sum1);
+        sum2 = _mm_add_epi32(sum2, sum3);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum2);
+    }
+
+    for (;j<len;j++)
+    {
+        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        vecSum = _mm_add_epi32(vecSum, sum0);
+    }
+
+    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
+    initSum = _mm_add_epi32(initSum, vecSum);
+    _mm_storeu_si128((__m128i *)sum, initSum);
+}
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+    opus_int  i, dataSize16;
+    opus_int32 sum;
+
+    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+
+    sum = 0;
+    dataSize16 = N & ~15;
+
+    acc1 = _mm_setzero_si128();
+    acc2 = _mm_setzero_si128();
+
+    for (i=0;i<dataSize16;i+=16)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+    }
+
+    acc1 = _mm_add_epi32( acc1, acc2 );
+
+    if (N - i >= 8)
+    {
+        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+        i += 8;
+    }
+
+    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
+    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
+    sum += _mm_cvtsi128_si32(acc1);
+
+    for (;i<N;i++) {
+        sum = silk_SMLABB(sum, x[i], y[i]);
+    }
+
+    return sum;
+}
+#endif

diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index 58f8324..837e8ae 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h

@@ -1,4 +1,5 @@
-/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
+   Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
 /**
    @file pitch_sse.h
    @brief Pitch analysis
@@ -32,11 +33,55 @@
 #ifndef PITCH_SSE_H
 #define PITCH_SSE_H
 
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+void xcorr_kernel_sse4_1(
+                    const opus_int16 *x,
+                    const opus_int16 *y,
+                    opus_val32       sum[4],
+                    int              len );
+
+extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_int16 *x,
+                    const opus_int16 *y,
+                    opus_val32       sum[4],
+                    int              len );
+
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
+
+opus_val32 celt_inner_prod_sse4_1(
+    const opus_int16 *x,
+    const opus_int16 *y,
+    int               N);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+opus_val32 celt_inner_prod_sse2(
+    const opus_int16 *x,
+    const opus_int16 *y,
+    int               N);
+#endif
+
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_int16 *x,
+                    const opus_int16 *y,
+                    int               N);
+
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+    ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+#else
+
 #include <xmmintrin.h>
 #include "arch.h"
 
 #define OVERRIDE_XCORR_KERNEL
-static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
 {
    int j;
    __m128 xsum1, xsum2;
@@ -71,6 +116,9 @@
    _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
 }
 
+#define xcorr_kernel(_x, _y, _z, len, arch) \
+    ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len))
+
 #define OVERRIDE_DUAL_INNER_PROD
 static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
       int N, opus_val32 *xy1, opus_val32 *xy2)
@@ -102,7 +150,7 @@
 }
 
 #define OVERRIDE_CELT_INNER_PROD
-static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
+static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
       int N)
 {
    int i;
@@ -127,6 +175,9 @@
    return xy;
 }
 
+#  define celt_inner_prod(_x, _y, len, arch) \
+    ((void)(arch),celt_inner_prod_sse(_x, _y, len))
+
 #define OVERRIDE_COMB_FILTER_CONST
 static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
       opus_val16 g10, opus_val16 g11, opus_val16 g12)
@@ -180,3 +231,4 @@
 }
 
 #endif
+#endif

diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
new file mode 100644
index 0000000..83410db
--- /dev/null
+++ b/celt/x86/x86_celt_map.c

@@ -0,0 +1,84 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#include "x86/x86cpu.h"
+#include "celt_lpc.h"
+#include "pitch.h"
+#include "pitch_sse.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if defined(FIXED_POINT)
+
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16       *y,
+         int              N,
+         int              ord,
+         opus_val16       *mem,
+         int              arch
+) = {
+  celt_fir_c,                /* non-sse */
+  celt_fir_c,
+  MAY_HAVE_SSE4_1(celt_fir), /* sse4.1  */
+  NULL
+};
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         opus_val32       sum[4],
+         int              len
+) = {
+  xcorr_kernel_c,                /* non-sse */
+  xcorr_kernel_c,
+  MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1  */
+  NULL
+};
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         int              N
+) = {
+  celt_inner_prod_c,                /* non-sse */
+  MAY_HAVE_SSE2(celt_inner_prod),
+  MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1  */
+  NULL
+};
+
+# else
+#  error "Floating-point implementation is not supported by x86 RTCD yet." \
+ "Reconfigure with --disable-rtcd or send patches."
+# endif
+
+#endif

diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
new file mode 100644
index 0000000..c82a4b7
--- /dev/null
+++ b/celt/x86/x86cpu.c

@@ -0,0 +1,111 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "cpu_support.h"
+#include "macros.h"
+#include "main.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+#if defined(_MSC_VER)
+
+#include <intrin.h>
+#define cpuid(info,x) __cpuid(info,x)
+#else
+
+#if defined(CPU_INFO_BY_C)
+#include <cpuid.h>
+#endif
+
+static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
+{
+#if defined(CPU_INFO_BY_ASM)
+    __asm__ __volatile__ (
+        "cpuid":
+        "=a" (CPUInfo[0]),
+        "=b" (CPUInfo[1]),
+        "=c" (CPUInfo[2]),
+        "=d" (CPUInfo[3]) :
+        "a" (InfoType), "c" (0)
+    );
+#elif defined(CPU_INFO_BY_C)
+    __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+#endif
+}
+
+#endif
+
+#include "SigProc_FIX.h"
+#include "celt_lpc.h"
+
+typedef struct CPU_Feature{
+    /*  SIMD: 128-bit */
+    int HW_SSE2;
+    int HW_SSE41;
+} CPU_Feature;
+
+static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
+{
+    unsigned int info[4] = {0};
+    unsigned int nIds = 0;
+
+    cpuid(info, 0);
+    nIds = info[0];
+
+    if (nIds >= 1){
+        cpuid(info, 1);
+        cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
+        cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
+    }
+}
+
+int opus_select_arch(void)
+{
+    CPU_Feature cpu_feature = {0};
+    int arch;
+
+    opus_cpu_feature_check(&cpu_feature);
+
+    arch = 0;
+    if (!cpu_feature.HW_SSE2)
+    {
+       return arch;
+    }
+    arch++;
+
+    if (!cpu_feature.HW_SSE41)
+    {
+        return arch;
+    }
+    arch++;
+
+    return arch;
+}

diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
new file mode 100644
index 0000000..2394b05
--- /dev/null
+++ b/celt/x86/x86cpu.h

@@ -0,0 +1,63 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(X86CPU_H)
+# define X86CPU_H
+
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+#  define MAY_HAVE_SSE2(name) name ## _sse2
+# else
+#  define MAY_HAVE_SSE2(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#  define MAY_HAVE_SSE4_1(name) name ## _sse4_1
+# else
+#  define MAY_HAVE_SSE4_1(name) name ## _c
+# endif
+
+# if defined(OPUS_HAVE_RTCD)
+int opus_select_arch(void);
+# endif
+
+/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi16_epi32()
+  when optimizations are disabled, even though the actual PMOVSXWD instruction
+  takes an m64. Unlike a normal m64 reference, these require 16-byte alignment
+  and load 16 bytes instead of 8, possibly reading out of bounds.
+
+  We can insert an explicit MOVQ using _mm_loadl_epi64(), which should have the
+  same semantics as an m64 reference in the PMOVSXWD instruction itself, but
+  gcc is not smart enough to optimize this out when optimizations ARE enabled.*/
+# if !defined(__OPTIMIZE__)
+#  define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+# else
+#  define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(*(__m128i *)(x)))
+# endif
+
+#endif
commit	c95c9a048f3283afb2e412b10085d4f7c19e1412	[log] [tgz]
author	xiangmingzhu <xiangzhu@cisco.com>	Wed Apr 30 15:48:07 2014 +0800
committer	Jean-Marc Valin <jmvalin@jmvalin.ca>	Fri Oct 03 21:16:00 2014 -0400
tree	ed8873af6559d7a98922e0fed85be47c826ef521
parent	80460334b77d70e665a390503cd8992cdad06c10 [diff]