The decision on whether to use the pitch is now based only on the energy in
the first three bands (instead of the total MDCT window energy)
diff --git a/libcelt/arch.h b/libcelt/arch.h
index f52fb12..0e41cce 100644
--- a/libcelt/arch.h
+++ b/libcelt/arch.h
@@ -80,6 +80,7 @@
 
 #define ENER_SCALING 16384.f
 #define ENER_SCALING_1 (1.f/16384.f)
+#define ENER_SHIFT 14
 
 #define PGAIN_SCALING 32768.f
 #define PGAIN_SCALING_1 (1.f/32768.f)
diff --git a/libcelt/celt.c b/libcelt/celt.c
index 5f08836..e602529 100644
--- a/libcelt/celt.c
+++ b/libcelt/celt.c
@@ -153,10 +153,9 @@
 }
 
 /** Apply window and compute the MDCT for all sub-frames and all channels in a frame */
-static celt_word32_t compute_mdcts(const mdct_lookup *lookup, const celt_word16_t * restrict window, celt_sig_t * restrict in, celt_sig_t * restrict out, int N, int overlap, int C)
+static void compute_mdcts(const mdct_lookup *lookup, const celt_word16_t * restrict window, celt_sig_t * restrict in, celt_sig_t * restrict out, int N, int overlap, int C)
 {
    int c, N4;
-   celt_word32_t E = 0;
    VARDECL(celt_word32_t, x);
    VARDECL(celt_word32_t, tmp);
    SAVE_STACK;
@@ -183,15 +182,12 @@
          x[j] = 0;
          x[2*N-j-1] = 0;
       }
-      for (j=0;j<2*N;j++)
-         E += MULT16_16(EXTRACT16(SHR32(x[j],SIG_SHIFT+4)),EXTRACT16(SHR32(x[j],SIG_SHIFT+4)));
       mdct_forward(lookup, x, tmp);
       /* Interleaving the sub-frames */
       for (j=0;j<N;j++)
          out[C*j+c] = tmp[j];
    }
    RESTORE_STACK;
-   return E;
 }
 
 /** Compute the IMDCT and apply window for all sub-frames and all channels in a frame */
@@ -266,7 +262,7 @@
    
    /*for (i=0;i<(B+1)*C*N;i++) printf ("%f(%d) ", in[i], i); printf ("\n");*/
    /* Compute MDCTs */
-   curr_power = compute_mdcts(&st->mode->mdct, st->mode->window, in, freq, N, st->overlap, C);
+   compute_mdcts(&st->mode->mdct, st->mode->window, in, freq, N, st->overlap, C);
 
 #if 0 /* Mask disabled until it can be made to do something useful */
    compute_mdct_masking(X, mask, B*C*N, st->Fs);
@@ -299,7 +295,7 @@
    /*for (i=0;i<N*B*C;i++)printf("%f ", X[i]);printf("\n");*/
 
    /* Compute MDCTs of the pitch part */
-   pitch_power = compute_mdcts(&st->mode->mdct, st->mode->window, st->out_mem+pitch_index*C, freq, N, st->overlap, C);
+   compute_mdcts(&st->mode->mdct, st->mode->window, st->out_mem+pitch_index*C, freq, N, st->overlap, C);
    
 
    quant_energy(st->mode, bandE, st->oldBandE, nbCompressedBytes*8/3, &st->enc);
@@ -309,15 +305,18 @@
       stereo_mix(st->mode, X, bandE, 1);
    }
 
-   /* Check if we can safely use the pitch (i.e. effective gain isn't too high) */
-   if (MULT16_32_Q15(QCONST16(.1f, 15),curr_power) + SHR16(10000,8) < pitch_power)
    {
       /* Normalise the pitch vector as well (discard the energies) */
       VARDECL(celt_ener_t, bandEp);
       ALLOC(bandEp, st->mode->nbEBands*st->mode->nbChannels, celt_ener_t);
       compute_band_energies(st->mode, freq, bandEp);
       normalise_bands(st->mode, freq, P, bandEp);
-
+      pitch_power = bandEp[0]+bandEp[1]+bandEp[2];
+   }
+   curr_power = bandE[0]+bandE[1]+bandE[2];
+   /* Check if we can safely use the pitch (i.e. effective gain isn't too high) */
+   if (MULT16_32_Q15(QCONST16(.1f, 15),curr_power) + QCONST32(10.f,ENER_SHIFT) < pitch_power)
+   {
       if (C==2)
          stereo_mix(st->mode, P, bandE, 1);
       /* Simulates intensity stereo */
diff --git a/libcelt/mdct.c b/libcelt/mdct.c
index cdfb94a..da67f91 100644
--- a/libcelt/mdct.c
+++ b/libcelt/mdct.c
@@ -105,6 +105,8 @@
       /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
       re = -HALF32(in[N2+N4+2*i] + in[N2+N4-2*i-1]);
       im = -HALF32(in[N4+2*i]    - in[N4-2*i-1]);
+      /* We could remove the HALF32 above and just use MULT16_32_Q16 below
+         (MIXED_PRECISION only) */
       out[2*i]   = S_MUL(re,l->trig[i])  -  S_MUL(im,l->trig[i+N4]);
       out[2*i+1] = S_MUL(im,l->trig[i])  +  S_MUL(re,l->trig[i+N4]);
    }
@@ -114,6 +116,8 @@
       /* Real part arranged as a-bR, Imag part arranged as -c-dR */
       re =  HALF32(in[2*i-N4] - in[N2+N4-2*i-1]);
       im = -HALF32(in[N4+2*i] + in[N+N4-2*i-1]);
+      /* We could remove the HALF32 above and just use MULT16_32_Q16 below
+         (MIXED_PRECISION only) */
       out[2*i]   = S_MUL(re,l->trig[i])  -  S_MUL(im,l->trig[i+N4]);
       out[2*i+1] = S_MUL(im,l->trig[i])  +  S_MUL(re,l->trig[i+N4]);
    }
diff --git a/libcelt/vq.c b/libcelt/vq.c
index ea89316..3a6494e 100644
--- a/libcelt/vq.c
+++ b/libcelt/vq.c
@@ -167,7 +167,7 @@
             /* Temporary sums of the new pulse(s) */
             Rxy = xy + MULT16_16(s,X[j]);
             Ryy = yy + 2*MULT16_16(s,y[j]) + MULT16_16(s,s);
-            Ryp = yp + MULT16_16(s, P[j]);
+            /* This score is approximate, but good enough for the first pulses */
             scores[j] = MULT32_32_Q31(MULT16_16(ROUND16(Rxy,14),ABS16(ROUND16(Rxy,14))), celt_rcp(SHR32(Ryy,12)));
          }
       } else {