Implements basic surround masking

The idea is that the rate of each stream is adjusted based on its
contribution to the total energy of a stereo downmix.
diff --git a/celt/celt.h b/celt/celt.h
index a8f7cb0..ea8c2f9 100644
--- a/celt/celt.h
+++ b/celt/celt.h
@@ -110,6 +110,12 @@
 #define OPUS_SET_LFE_REQUEST    10024
 #define OPUS_SET_LFE(x) OPUS_SET_LFE_REQUEST, __opus_check_int(x)
 
+#define OPUS_SET_ENERGY_SAVE_REQUEST    10026
+#define OPUS_SET_ENERGY_SAVE(x) OPUS_SET_ENERGY_SAVE_REQUEST, __opus_check_val16_ptr(x)
+
+#define OPUS_SET_ENERGY_MASK_REQUEST    10028
+#define OPUS_SET_ENERGY_MASK(x) OPUS_SET_ENERGY_MASK_REQUEST, __opus_check_val16_ptr(x)
+
 /* Encoder stuff */
 
 int celt_encoder_get_size(int channels);
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index 6ac2457..2030ad0 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -109,6 +109,8 @@
    opus_val16 overlap_max;
    opus_val16 stereo_saving;
    int intensity;
+   opus_val16 *energy_save;
+   opus_val16 *energy_mask;
 
 #ifdef RESYNTH
    /* +MAX_PERIOD/2 to make space for overlap */
@@ -1165,6 +1167,7 @@
    int secondMdct;
    int signalBandwidth;
    int transient_got_disabled=0;
+   opus_val16 surround_masking=0;
    ALLOC_STACK;
 
    mode = st->mode;
@@ -1397,6 +1400,27 @@
          bandE[i] = IMIN(bandE[i], MULT16_32_Q15(QCONST16(1e-4f,15),bandE[0]));
    }
    amp2Log2(mode, effEnd, st->end, bandE, bandLogE, C);
+   if (st->energy_save)
+   {
+      opus_val16 offset = shortBlocks?HALF16(SHL16(LM, DB_SHIFT)):0;
+#ifdef FIXED_POINT
+      /* Compensate for the 1/8 gain we apply in the fixed-point downshift to avoid overflows. */
+      offset -= QCONST16(3.0f, DB_SHIFT);
+#endif
+      for(i=0;i<C*nbEBands;i++)
+         st->energy_save[i]=bandLogE[i]-offset;
+      st->energy_save=NULL;
+   }
+   if (st->energy_mask&&!st->lfe)
+   {
+      opus_val32 mask_avg=0;
+      opus_val16 offset = shortBlocks?HALF16(SHL16(LM, DB_SHIFT)):0;
+      for (c=0;c<C;c++)
+         for(i=0;i<st->end;i++)
+            mask_avg += bandLogE[nbEBands*c+i]-offset-st->energy_mask[nbEBands*c+i];
+      surround_masking = DIV32_16(mask_avg,C*st->end) + QCONST16(.0f, DB_SHIFT);
+      surround_masking = MIN16(MAX16(surround_masking,-QCONST16(1.5f, DB_SHIFT)), 0);
+   }
    /*for (i=0;i<21;i++)
       printf("%f ", bandLogE[i]);
    printf("\n");*/
@@ -1625,7 +1649,7 @@
         max_frac = DIV32_16(MULT16_16(QCONST16(0.8f, 15), coded_stereo_dof), coded_bins);
         /*printf("%d %d %d ", coded_stereo_dof, coded_bins, tot_boost);*/
         target -= (opus_int32)MIN32(MULT16_32_Q15(max_frac,target),
-                        SHR16(MULT16_16(st->stereo_saving-QCONST16(0.1f,8),(coded_stereo_dof<<BITRES)),8));
+                        SHR32(MULT16_16(st->stereo_saving-QCONST16(0.1f,8),(coded_stereo_dof<<BITRES)),8));
      }
      /* Boost the rate according to dynalloc (minus the dynalloc average for calibration). */
      target += tot_boost-(16<<LM);
@@ -1649,6 +1673,13 @@
      }
 #endif
 
+     if (st->energy_mask&&!st->lfe)
+     {
+        opus_int32 surround_target = target + SHR32(MULT16_16(surround_masking,coded_bins<<BITRES), DB_SHIFT);
+        /*printf("%f %d %d %d %d %d %d ", surround_masking, coded_bins, st->end, st->intensity, surround_target, target, st->bitrate);*/
+        target = IMAX(target/4, surround_target);
+     }
+
      {
         opus_int32 floor_depth;
         int bins;
@@ -1660,7 +1691,7 @@
         /*printf("%f %d\n", maxDepth, floor_depth);*/
      }
 
-     if (st->constrained_vbr || st->bitrate<64000)
+     if ((!st->energy_mask||st->lfe) && (st->constrained_vbr || st->bitrate<64000))
      {
         opus_val16 rate_factor;
 #ifdef FIXED_POINT
@@ -1759,7 +1790,10 @@
    codedBands = compute_allocation(mode, st->start, st->end, offsets, cap,
          alloc_trim, &st->intensity, &dual_stereo, bits, &balance, pulses,
          fine_quant, fine_priority, C, LM, enc, 1, st->lastCodedBands, signalBandwidth);
-   st->lastCodedBands = codedBands;
+   if (st->lastCodedBands)
+      st->lastCodedBands = IMIN(st->lastCodedBands+1,IMAX(st->lastCodedBands-1,codedBands));
+   else
+      st->lastCodedBands = codedBands;
 
    quant_fine_energy(mode, st->start, st->end, oldBandE, error, fine_quant, enc, C);
 
@@ -2151,6 +2185,18 @@
           st->lfe = value;
       }
       break;
+      case OPUS_SET_ENERGY_SAVE_REQUEST:
+      {
+          opus_val16 *value = va_arg(ap, opus_val16*);
+          st->energy_save=value;
+      }
+      break;
+      case OPUS_SET_ENERGY_MASK_REQUEST:
+      {
+          opus_val16 *value = va_arg(ap, opus_val16*);
+          st->energy_mask = value;
+      }
+      break;
       default:
          goto bad_request;
    }
diff --git a/include/opus_defines.h b/include/opus_defines.h
index 203144a..00918b8 100644
--- a/include/opus_defines.h
+++ b/include/opus_defines.h
@@ -158,6 +158,7 @@
 #define __opus_check_int(x) (((void)((x) == (opus_int32)0)), (opus_int32)(x))
 #define __opus_check_int_ptr(ptr) ((ptr) + ((ptr) - (opus_int32*)(ptr)))
 #define __opus_check_uint_ptr(ptr) ((ptr) + ((ptr) - (opus_uint32*)(ptr)))
+#define __opus_check_val16_ptr(ptr) ((ptr) + ((ptr) - (opus_val16*)(ptr)))
 /** @endcond */
 
 /** @defgroup opus_ctlvalues Pre-defined values for CTL interface
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 235f557..e2a6347 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -94,6 +94,7 @@
     int          silk_bw_switch;
     /* Sampling rate (at the API level) */
     int          first;
+    int          energy_masking;
     StereoWidthState width_mem;
     opus_val16   delay_buffer[MAX_ENCODER_BUFFER*2];
 #ifndef FIXED_POINT
@@ -1602,7 +1603,7 @@
     st->prev_HB_gain = HB_gain;
     if (st->mode != MODE_HYBRID || st->stream_channels==1)
        st->silk_mode.stereoWidth_Q14 = IMIN((1<<14),IMAX(0,st->bitrate_bps-32000));
-    if( st->channels == 2 ) {
+    if( !st->energy_masking && st->channels == 2 ) {
         /* Apply stereo width reduction (at low bitrates) */
         if( st->hybrid_stereo_width_Q14 < (1 << 14) || st->silk_mode.stereoWidth_Q14 < (1 << 14) ) {
             opus_val16 g1, g2;
@@ -2216,6 +2217,19 @@
             celt_encoder_ctl(celt_enc, OPUS_SET_LFE(value));
         }
         break;
+        case OPUS_SET_ENERGY_SAVE_REQUEST:
+        {
+            opus_val16 *value = va_arg(ap, opus_val16*);
+            celt_encoder_ctl(celt_enc, OPUS_SET_ENERGY_SAVE(value));
+        }
+        break;
+        case OPUS_SET_ENERGY_MASK_REQUEST:
+        {
+            opus_val16 *value = va_arg(ap, opus_val16*);
+            st->energy_masking = (value!=NULL);
+            celt_encoder_ctl(celt_enc, OPUS_SET_ENERGY_MASK(value));
+        }
+        break;
 
         case CELT_GET_MODE_REQUEST:
         {
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index d94aa70..163e73c 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -61,6 +61,7 @@
    ChannelLayout layout;
    int lfe_stream;
    int variable_duration;
+   int surround;
    opus_int32 bitrate_bps;
    opus_val32 subframe_mem[3];
    /* Encoder states go here */
@@ -104,6 +105,7 @@
 {
    int nb_streams;
    int nb_coupled_streams;
+   opus_int32 size;
 
    if (mapping_family==0)
    {
@@ -127,7 +129,10 @@
       nb_coupled_streams=0;
    } else
       return 0;
-   return opus_multistream_encoder_get_size(nb_streams, nb_coupled_streams);
+   size = opus_multistream_encoder_get_size(nb_streams, nb_coupled_streams);
+   if (channels>2)
+      size += align(opus_encoder_get_size(2));
+   return size;
 }
 
 
@@ -171,9 +176,9 @@
    for (i=0;i<st->layout.nb_coupled_streams;i++)
    {
       ret = opus_encoder_init((OpusEncoder*)ptr, Fs, 2, application);
+      if(ret!=OPUS_OK)return ret;
       if (i==st->lfe_stream)
          opus_encoder_ctl((OpusEncoder*)ptr, OPUS_SET_LFE(1));
-      if(ret!=OPUS_OK)return ret;
       ptr += align(coupled_size);
    }
    for (;i<st->layout.nb_streams;i++)
@@ -184,6 +189,14 @@
       if(ret!=OPUS_OK)return ret;
       ptr += align(mono_size);
    }
+   if (surround && st->layout.nb_channels>2)
+   {
+      OpusEncoder *downmix_enc;
+      downmix_enc = (OpusEncoder*)ptr;
+      ret = opus_encoder_init(downmix_enc, Fs, 2, OPUS_APPLICATION_AUDIO);
+      if(ret!=OPUS_OK)return ret;
+   }
+   st->surround = surround;
    return OPUS_OK;
 }
 
@@ -332,6 +345,13 @@
   int frame_size
 );
 
+typedef void (*opus_surround_downmix_funct)(
+  opus_val16 *dst,
+  const void *src,
+  int channels,
+  int frame_size
+);
+
 static void surround_rate_allocation(
       OpusMSEncoder *st,
       opus_int32 *rate,
@@ -398,7 +418,8 @@
     int frame_size,
     unsigned char *data,
     opus_int32 max_data_bytes,
-    int lsb_depth
+    int lsb_depth,
+    opus_surround_downmix_funct surround_downmix
 #ifndef FIXED_POINT
     , downmix_func downmix
     , const void *pcm_analysis
@@ -418,6 +439,8 @@
    AnalysisInfo analysis_info;
    const CELTMode *celt_mode;
    opus_int32 bitrates[256];
+   opus_val16 bandLogE[42];
+   opus_val16 bandLogE_mono[21];
    ALLOC_STACK;
 
    ptr = (char*)st + align(sizeof(OpusMSEncoder));
@@ -461,6 +484,36 @@
    coupled_size = opus_encoder_get_size(2);
    mono_size = opus_encoder_get_size(1);
 
+   if (st->surround && st->layout.nb_channels>2)
+   {
+      int i;
+      unsigned char dummy[512];
+      /* Temporary kludge -- remove */
+      OpusEncoder *downmix_enc;
+
+      ptr = (char*)st + align(sizeof(OpusMSEncoder));
+      for (s=0;s<st->layout.nb_streams;s++)
+      {
+         if (s < st->layout.nb_coupled_streams)
+            ptr += align(coupled_size);
+         else
+            ptr += align(mono_size);
+      }
+      downmix_enc = (OpusEncoder*)ptr;
+      surround_downmix(buf, pcm, st->layout.nb_channels, frame_size);
+      opus_encoder_ctl(downmix_enc, OPUS_SET_ENERGY_SAVE(bandLogE));
+      opus_encoder_ctl(downmix_enc, OPUS_SET_BANDWIDTH(OPUS_BANDWIDTH_FULLBAND));
+      opus_encoder_ctl(downmix_enc, OPUS_SET_FORCE_MODE(MODE_CELT_ONLY));
+      opus_encoder_ctl(downmix_enc, OPUS_SET_FORCE_CHANNELS(2));
+      opus_encode_native(downmix_enc, buf, frame_size, dummy, 512, lsb_depth
+#ifndef FIXED_POINT
+            , &analysis_info
+#endif
+            );
+      for(i=0;i<21;i++)
+         bandLogE_mono[i] = MAX16(bandLogE[i], bandLogE[21+i]);
+   }
+
    if (max_data_bytes < 4*st->layout.nb_streams-1)
    {
       RESTORE_STACK;
@@ -480,6 +533,13 @@
       else
          ptr += align(mono_size);
       opus_encoder_ctl(enc, OPUS_SET_BITRATE(bitrates[s]));
+      if (st->surround)
+      {
+         opus_encoder_ctl(enc, OPUS_SET_FORCE_MODE(MODE_CELT_ONLY));
+         opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(OPUS_BANDWIDTH_FULLBAND));
+         if (s < st->layout.nb_coupled_streams)
+            opus_encoder_ctl(enc, OPUS_SET_FORCE_CHANNELS(2));
+      }
    }
 
    ptr = (char*)st + align(sizeof(OpusMSEncoder));
@@ -503,11 +563,17 @@
          (*copy_channel_in)(buf+1, 2,
             pcm, st->layout.nb_channels, right, frame_size);
          ptr += align(coupled_size);
+         /* FIXME: This isn't correct for the coupled center channels in
+            6.1 surround configuration */
+         if (st->surround)
+            opus_encoder_ctl(enc, OPUS_SET_ENERGY_MASK(bandLogE));
       } else {
          int chan = get_mono_channel(&st->layout, s, -1);
          (*copy_channel_in)(buf, 1,
             pcm, st->layout.nb_channels, chan, frame_size);
          ptr += align(mono_size);
+         if (st->surround)
+            opus_encoder_ctl(enc, OPUS_SET_ENERGY_MASK(bandLogE_mono));
       }
       /* number of bytes left (+Toc) */
       curr_max = max_data_bytes - tot_size;
@@ -557,6 +623,85 @@
       dst[i*dst_stride] = float_src[i*src_stride+src_channel];
 #endif
 }
+
+static void channel_pos(int channels, int pos[8])
+{
+   /* Position in the mix: 0 don't mix, 1: left, 2: center, 3:right */
+   if (channels==4)
+   {
+      pos[0]=1;
+      pos[1]=3;
+      pos[2]=1;
+      pos[3]=3;
+   } else if (channels==3||channels==5||channels==6)
+   {
+      pos[0]=1;
+      pos[1]=2;
+      pos[2]=3;
+      pos[3]=1;
+      pos[4]=3;
+      pos[5]=0;
+   } else if (channels==7)
+   {
+      pos[0]=1;
+      pos[1]=2;
+      pos[2]=3;
+      pos[3]=1;
+      pos[4]=3;
+      pos[5]=2;
+      pos[6]=0;
+   } else if (channels==8)
+   {
+      pos[0]=1;
+      pos[1]=2;
+      pos[2]=3;
+      pos[3]=1;
+      pos[4]=3;
+      pos[5]=1;
+      pos[6]=3;
+      pos[7]=0;
+   }
+}
+
+static void opus_surround_downmix_float(
+  opus_val16 *dst,
+  const void *src,
+  int channels,
+  int frame_size
+)
+{
+   const float *float_src;
+   opus_int32 i;
+   int pos[8] = {0};
+   int c;
+   float_src = (const float *)src;
+
+   channel_pos(channels, pos);
+   for (i=0;i<2*frame_size;i++)
+      dst[i]=0;
+
+   for (c=0;c<channels;c++)
+   {
+      if (pos[c]==1||pos[c]==2)
+      {
+         for (i=0;i<frame_size;i++)
+#if defined(FIXED_POINT)
+            dst[2*i] += SHR16(FLOAT2INT16(float_src[i*channels+c]),3);
+#else
+            dst[2*i] += float_src[i*channels+c];
+#endif
+      }
+      if (pos[c]==2||pos[c]==3)
+      {
+         for (i=0;i<frame_size;i++)
+#if defined(FIXED_POINT)
+            dst[2*i+1] += SHR16(FLOAT2INT16(float_src[i*channels+c]),3);
+#else
+            dst[2*i+1] += float_src[i*channels+c];
+#endif
+      }
+   }
+}
 #endif
 
 static void opus_copy_channel_in_short(
@@ -579,6 +724,47 @@
 #endif
 }
 
+static void opus_surround_downmix_short(
+  opus_val16 *dst,
+  const void *src,
+  int channels,
+  int frame_size
+)
+{
+   const opus_int16 *short_src;
+   opus_int32 i;
+   int pos[8] = {0};
+   int c;
+   short_src = (const opus_int16 *)src;
+
+   channel_pos(channels, pos);
+   for (i=0;i<2*frame_size;i++)
+      dst[i]=0;
+
+   for (c=0;c<channels;c++)
+   {
+      if (pos[c]==1||pos[c]==2)
+      {
+         for (i=0;i<frame_size;i++)
+#if defined(FIXED_POINT)
+            dst[2*i] += SHR16(short_src[i*channels+c],3);
+#else
+            dst[2*i] += (1/32768.f)*short_src[i*channels+c];
+#endif
+      }
+      if (pos[c]==2||pos[c]==3)
+      {
+         for (i=0;i<frame_size;i++)
+#if defined(FIXED_POINT)
+            dst[2*i+1] += SHR16(short_src[i*channels+c],3);
+#else
+            dst[2*i+1] += (1/32768.f)*short_src[i*channels+c];
+#endif
+      }
+   }
+}
+
+
 #ifdef FIXED_POINT
 int opus_multistream_encode(
     OpusMSEncoder *st,
@@ -589,7 +775,7 @@
 )
 {
    return opus_multistream_encode_native(st, opus_copy_channel_in_short,
-      pcm, frame_size, data, max_data_bytes, 16);
+      pcm, frame_size, data, max_data_bytes, 16, opus_surround_downmix_float);
 }
 
 #ifndef DISABLE_FLOAT_API
@@ -602,7 +788,7 @@
 )
 {
    return opus_multistream_encode_native(st, opus_copy_channel_in_float,
-      pcm, frame_size, data, max_data_bytes, 16);
+      pcm, frame_size, data, max_data_bytes, 16, opus_surround_downmix_short);
 }
 #endif
 
@@ -619,7 +805,7 @@
 {
    int channels = st->layout.nb_streams + st->layout.nb_coupled_streams;
    return opus_multistream_encode_native(st, opus_copy_channel_in_float,
-      pcm, frame_size, data, max_data_bytes, 24, downmix_float, pcm+channels*st->analysis.analysis_offset);
+      pcm, frame_size, data, max_data_bytes, 24, opus_surround_downmix_float, downmix_float, pcm+channels*st->analysis.analysis_offset);
 }
 
 int opus_multistream_encode(
@@ -632,7 +818,7 @@
 {
    int channels = st->layout.nb_streams + st->layout.nb_coupled_streams;
    return opus_multistream_encode_native(st, opus_copy_channel_in_short,
-      pcm, frame_size, data, max_data_bytes, 16, downmix_int, pcm+channels*st->analysis.analysis_offset);
+      pcm, frame_size, data, max_data_bytes, 16, opus_surround_downmix_short, downmix_int, pcm+channels*st->analysis.analysis_offset);
 }
 #endif