armv7(float): Optimize encode usecase using NE10 library

Optimize opus encode (float only) usecase using ARM NE10
library. Mainly effects opus_fft and ctl_mdct_forward
and related functions.

This optimization can be used for ARM CPUs that have NEON
VFP unit. This patch only enables optimizations for ARMv7.

Official ARM NE10 library page available at
http://projectne10.github.io/Ne10/

To enable this optimization, use
--enable-intrinsics --with-NE10=<install_prefix>
or
--enable-intrinsics --with-NE10-libraries=<NE10_lib_dir> --with-NE10-includes=<NE10_includes_dir>

Compile time checks made during configure process to make sure
optimization option available only when compiler supports NEON
instrinsics.

Runtime checks made to make sure optimized functions only called
on appropriate hardware.

Signed-off-by: Timothy B. Terriberry <tterribe@xiph.org>
diff --git a/src/analysis.c b/src/analysis.c
index 401a43e..322e53c 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -187,7 +187,7 @@
    info_out->music_prob = psum;
 }
 
-static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
+static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix, int arch)
 {
     int i, b;
     const kiss_fft_state *kfft;
@@ -260,7 +260,7 @@
     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
     tonal->mem_fill = 240 + remaining;
-    opus_fft(kfft, in, out);
+    opus_fft(kfft, in, out, arch);
 #ifndef FIXED_POINT
     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
     if (celt_isnan(out[0].r))
@@ -633,7 +633,7 @@
 
 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
-                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
+                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info, int arch)
 {
    int offset;
    int pcm_len;
@@ -646,7 +646,7 @@
       pcm_len = analysis_frame_size - analysis->analysis_offset;
       offset = analysis->analysis_offset;
       do {
-         tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
+         tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix, arch);
          offset += 480;
          pcm_len -= 480;
       } while (pcm_len>0);
diff --git a/src/analysis.h b/src/analysis.h
index 85a73d7..9c328e8 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -82,6 +82,6 @@
 
 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
-                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info);
+                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info, int arch);
 
 #endif
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index d11e972..9dbe4bf 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -1006,7 +1006,7 @@
        analysis_read_subframe_bak = st->analysis.read_subframe;
        run_analysis(&st->analysis, celt_mode, analysis_pcm, analysis_size, frame_size,
              c1, c2, analysis_channels, st->Fs,
-             lsb_depth, downmix, &analysis_info);
+             lsb_depth, downmix, &analysis_info, st->arch);
     }
 #else
     (void)analysis_pcm;
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index aa6a267..9e85773 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -72,6 +72,7 @@
 
 struct OpusMSEncoder {
    ChannelLayout layout;
+   int arch;
    int lfe_stream;
    int application;
    int variable_duration;
@@ -221,7 +222,7 @@
 #endif
 
 void surround_analysis(const CELTMode *celt_mode, const void *pcm, opus_val16 *bandLogE, opus_val32 *mem, opus_val32 *preemph_mem,
-      int len, int overlap, int channels, int rate, opus_copy_channel_in_func copy_channel_in
+      int len, int overlap, int channels, int rate, opus_copy_channel_in_func copy_channel_in, int arch
 )
 {
    int c;
@@ -273,7 +274,8 @@
          }
       }
 #endif
-      clt_mdct_forward(&celt_mode->mdct, in, freq, celt_mode->window, overlap, celt_mode->maxLM-LM, 1);
+      clt_mdct_forward(&celt_mode->mdct, in, freq, celt_mode->window,
+            overlap, celt_mode->maxLM-LM, 1, arch);
       if (upsample != 1)
       {
          int bound = len;
@@ -427,6 +429,7 @@
        (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
       return OPUS_BAD_ARG;
 
+   st->arch = opus_select_arch();
    st->layout.nb_channels = channels;
    st->layout.nb_streams = streams;
    st->layout.nb_coupled_streams = coupled_streams;
@@ -783,7 +786,7 @@
    ALLOC(bandSMR, 21*st->layout.nb_channels, opus_val16);
    if (st->surround)
    {
-      surround_analysis(celt_mode, pcm, bandSMR, mem, preemph_mem, frame_size, 120, st->layout.nb_channels, Fs, copy_channel_in);
+      surround_analysis(celt_mode, pcm, bandSMR, mem, preemph_mem, frame_size, 120, st->layout.nb_channels, Fs, copy_channel_in, st->arch);
    }
 
    /* Compute bitrate allocation between streams (this could be a lot better) */