Use -msse2 for SSE2 optimized code.

When targeting 32-bit Linux, we need to pass -msse2 to gcc to compile
SSE2 intrinsics. However, -msse2 also gives gcc license to automatically
generate SSE2 instructions wherever it pleases. This will crash our code
on processors without SSE2 support.

This change breaks the files with SSE2 intrinsics into separate targets,
such that we can limit the scope of -msse2 to where it's needed.

We no longer need to employ the WEBRTC_USE_SSE2 define; the build system
decides when SSE2 is supported and compiles the appropriate files.

TBR=bjornv@webrtc.org
TEST=audioproc (performance testing), audioproc_unittest, video_processing_unittests, build on Linux (targeting ia32/x64, with disable_sse2==0/1), Mac, Windows

Review URL: http://webrtc-codereview.appspot.com/352008

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1425 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/android-webrtc.mk b/android-webrtc.mk
index dc92aeb..0dae14c 100644
--- a/android-webrtc.mk
+++ b/android-webrtc.mk
@@ -42,8 +42,3 @@
 MY_WEBRTC_COMMON_DEFS += \
     '-DWEBRTC_ARCH_ARM_V7A'
 endif
-
-else ifeq ($(TARGET_ARCH),x86)
-MY_WEBRTC_COMMON_DEFS += \
-    '-DWEBRTC_USE_SSE2'
-endif
diff --git a/src/modules/audio_processing/aec/Android.mk b/src/modules/audio_processing/aec/Android.mk
index 698755a..7d53958 100644
--- a/src/modules/audio_processing/aec/Android.mk
+++ b/src/modules/audio_processing/aec/Android.mk
@@ -20,9 +20,12 @@
     aec_resampler.c \
     aec_core.c \
     aec_rdft.c \
+
+ifeq ($(TARGET_ARCH),x86)
+LOCAL_SRC_FILES += \
     aec_core_sse2.c \
     aec_rdft_sse2.c
-
+endif
 
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
diff --git a/src/modules/audio_processing/aec/aec.gypi b/src/modules/audio_processing/aec/aec.gypi
index 7e86a90..4b3a08d 100644
--- a/src/modules/audio_processing/aec/aec.gypi
+++ b/src/modules/audio_processing/aec/aec.gypi
@@ -16,8 +16,8 @@
         'aec_debug_dump%': 0,
       },
       'dependencies': [
+        'apm_util',
         '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
-        'apm_util'
       ],
       'include_dirs': [
         'interface',
@@ -32,18 +32,37 @@
         'echo_cancellation.c',
         'aec_core.h',
         'aec_core.c',
-        'aec_core_sse2.c',
         'aec_rdft.h',
         'aec_rdft.c',
-        'aec_rdft_sse2.c',
         'aec_resampler.h',
         'aec_resampler.c',
       ],
       'conditions': [
+        ['target_arch=="ia32" or target_arch=="x64"', {
+          'dependencies': [ 'aec_sse2', ],
+        }],
         ['aec_debug_dump==1', {
           'defines': [ 'WEBRTC_AEC_DEBUG_DUMP', ],
         }],
       ],
     },
+    {
+      'target_name': 'aec_sse2',
+      'type': '<(library)',
+      'sources': [
+        'aec_core_sse2.c',
+        'aec_rdft_sse2.c',
+      ],
+      'conditions': [
+        ['os_posix==1 and OS!="mac"', {
+          'cflags': [ '-msse2', ],
+        }],
+        ['OS=="mac"', {
+          'xcode_settings': {
+            'OTHER_CFLAGS': [ '-msse2', ],
+          },
+        }],
+      ],
+    },
   ],
 }
diff --git a/src/modules/audio_processing/aec/aec_core.c b/src/modules/audio_processing/aec/aec_core.c
index 6718dec..1637e6f 100644
--- a/src/modules/audio_processing/aec/aec_core.c
+++ b/src/modules/audio_processing/aec/aec_core.c
@@ -21,6 +21,7 @@
 #include <string.h>
 
 #include "aec_rdft.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
 #include "delay_estimator_wrapper.h"
 #include "ring_buffer.h"
 #include "system_wrappers/interface/cpu_features_wrapper.h"
@@ -516,11 +517,13 @@
     WebRtcAec_ScaleErrorSignal = ScaleErrorSignal;
     WebRtcAec_FilterAdaptation = FilterAdaptation;
     WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
+
+#if defined(WEBRTC_ARCH_X86_FAMILY)
     if (WebRtc_GetCPUInfo(kSSE2)) {
-#if defined(WEBRTC_USE_SSE2)
       WebRtcAec_InitAec_SSE2();
-#endif
     }
+#endif
+
     aec_rdft_init();
 
     return 0;
diff --git a/src/modules/audio_processing/aec/aec_core.h b/src/modules/audio_processing/aec/aec_core.h
index 1b9828a..d326a68 100644
--- a/src/modules/audio_processing/aec/aec_core.h
+++ b/src/modules/audio_processing/aec/aec_core.h
@@ -15,9 +15,10 @@
 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
 #define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
 
+#ifdef WEBRTC_AEC_DEBUG_DUMP
 #include <stdio.h>
+#endif
 
-#include "signal_processing_library.h"
 #include "typedefs.h"
 
 #define FRAME_LEN 80
diff --git a/src/modules/audio_processing/aec/aec_core_sse2.c b/src/modules/audio_processing/aec/aec_core_sse2.c
index 8894f28..74a1c48 100644
--- a/src/modules/audio_processing/aec/aec_core_sse2.c
+++ b/src/modules/audio_processing/aec/aec_core_sse2.c
@@ -12,13 +12,12 @@
  * The core AEC algorithm, SSE2 version of speed-critical functions.
  */
 
-#include "typedefs.h"
+#include "aec_core.h"
 
-#if defined(WEBRTC_USE_SSE2)
 #include <emmintrin.h>
 #include <math.h>
+#include <string.h>  // memset
 
-#include "aec_core.h"
 #include "aec_rdft.h"
 
 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm)
@@ -414,4 +413,3 @@
   WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
 }
 
-#endif   // WEBRTC_USE_SSE2
diff --git a/src/modules/audio_processing/aec/aec_rdft.c b/src/modules/audio_processing/aec/aec_rdft.c
index 9222334..19908d8 100644
--- a/src/modules/audio_processing/aec/aec_rdft.c
+++ b/src/modules/audio_processing/aec/aec_rdft.c
@@ -576,11 +576,11 @@
   cftmdl_128 = cftmdl_128_C;
   rftfsub_128 = rftfsub_128_C;
   rftbsub_128 = rftbsub_128_C;
+#if defined(WEBRTC_ARCH_X86_FAMILY)
   if (WebRtc_GetCPUInfo(kSSE2)) {
-#if defined(WEBRTC_USE_SSE2)
     aec_rdft_init_sse2();
-#endif
   }
+#endif
   // init library constants.
   makewt_32();
   makect_32();
diff --git a/src/modules/audio_processing/aec/aec_rdft_sse2.c b/src/modules/audio_processing/aec/aec_rdft_sse2.c
index f936e2a..eeb3152 100644
--- a/src/modules/audio_processing/aec/aec_rdft_sse2.c
+++ b/src/modules/audio_processing/aec/aec_rdft_sse2.c
@@ -8,13 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "typedefs.h"
-
-#if defined(WEBRTC_USE_SSE2)
-#include <emmintrin.h>
-
 #include "aec_rdft.h"
 
+#include <emmintrin.h>
+
 static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] =
   {-1.f, 1.f, -1.f, 1.f};
 
@@ -428,4 +425,3 @@
   rftbsub_128 = rftbsub_128_SSE2;
 }
 
-#endif  // WEBRTC_USE_SS2
diff --git a/src/modules/audio_processing/aec/echo_cancellation.c b/src/modules/audio_processing/aec/echo_cancellation.c
index 66c9b97..021df05 100644
--- a/src/modules/audio_processing/aec/echo_cancellation.c
+++ b/src/modules/audio_processing/aec/echo_cancellation.c
@@ -22,6 +22,7 @@
 
 #include "aec_core.h"
 #include "aec_resampler.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
 #include "ring_buffer.h"
 #include "typedefs.h"
 
diff --git a/src/modules/video_processing/main/source/Android.mk b/src/modules/video_processing/main/source/Android.mk
index 03d2d74..74d15cb 100644
--- a/src/modules/video_processing/main/source/Android.mk
+++ b/src/modules/video_processing/main/source/Android.mk
@@ -18,7 +18,6 @@
 LOCAL_MODULE_TAGS := optional
 LOCAL_CPP_EXTENSION := .cc
 LOCAL_SRC_FILES := \
-    video_processing_impl.cc \
     brightness_detection.cc \
     color_enhancement.cc \
     content_analysis.cc \
@@ -27,6 +26,13 @@
     frame_preprocessor.cc \
     spatial_resampler.cc \
-    video_decimator.cc
+    video_decimator.cc \
+    video_processing_impl.cc
+
+# The SSE2 implementation is only compiled for x86 targets.
+ifeq ($(TARGET_ARCH),x86)
+LOCAL_SRC_FILES += \
+    content_analysis_sse2.cc
+endif
 
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
diff --git a/src/modules/video_processing/main/source/content_analysis.cc b/src/modules/video_processing/main/source/content_analysis.cc
index 45935eb..32ee09a 100644
--- a/src/modules/video_processing/main/source/content_analysis.cc
+++ b/src/modules/video_processing/main/source/content_analysis.cc
@@ -13,12 +13,10 @@
 
 #include <math.h>
 #include <stdlib.h>
-#if defined(WEBRTC_USE_SSE2)
-#include <emmintrin.h>
-#endif
+
 namespace webrtc {
 
-VPMContentAnalysis::VPMContentAnalysis(bool RTCD):
+VPMContentAnalysis::VPMContentAnalysis(bool runtime_cpu_detection):
 _origFrame(NULL),
 _prevFrame(NULL),
 _width(0),
@@ -40,16 +38,16 @@
     ComputeSpatialMetrics = &VPMContentAnalysis::ComputeSpatialMetrics_C;
     TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_C;
 
-    if (RTCD)
+    if (runtime_cpu_detection)
     {
-        if(WebRtc_GetCPUInfo(kSSE2))
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+        if (WebRtc_GetCPUInfo(kSSE2))
         {
-#if defined(WEBRTC_USE_SSE2)
             ComputeSpatialMetrics =
                           &VPMContentAnalysis::ComputeSpatialMetrics_SSE2;
             TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_SSE2;
-#endif
         }
+#endif
     }
 
     Release();
@@ -249,110 +247,6 @@
 
 }
 
-#if defined(WEBRTC_USE_SSE2)
-WebRtc_Word32
-VPMContentAnalysis::TemporalDiffMetric_SSE2()
-{
-    WebRtc_UWord32 numPixels = 0;       // counter for # of pixels
-
-    const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border;
-    const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border;
-
-    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
-
-    __m128i sad_64   = _mm_setzero_si128();
-    __m128i sum_64   = _mm_setzero_si128();
-    __m128i sqsum_64 = _mm_setzero_si128();
-    const __m128i z  = _mm_setzero_si128();
-
-    for(WebRtc_UWord16 i = 0; i < (_height - 2*_border); i += _skipNum)
-    {
-        __m128i sqsum_32  = _mm_setzero_si128();
-
-        const WebRtc_UWord8 *lineO = imgBufO;
-        const WebRtc_UWord8 *lineP = imgBufP;
-
-        // Work on 16 pixels at a time.  For HD content with a width of 1920
-        // this loop will run ~67 times (depending on border).  Maximum for
-        // abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit
-        // results which are then accumulated.  There is no chance of
-        // rollover for these two accumulators.
-        // o*o will have a maximum of 255*255 = 65025.  This will roll over
-        // a 16 bit accumulator as 67*65025 > 65535, but will fit in a
-        // 32 bit accumulator.
-        for(WebRtc_UWord16 j = 0; j < width_end - _border; j += 16)
-        {
-            const __m128i o = _mm_loadu_si128((__m128i*)(lineO));
-            const __m128i p = _mm_loadu_si128((__m128i*)(lineP));
-
-            lineO += 16;
-            lineP += 16;
-
-            // abs pixel difference between frames
-            sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p));
-
-            // sum of all pixels in frame
-            sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z));
-
-            // squared sum of all pixels in frame
-            const __m128i olo = _mm_unpacklo_epi8(o,z);
-            const __m128i ohi = _mm_unpackhi_epi8(o,z);
-
-            const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo);
-            const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi);
-
-            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo);
-            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi);
-        }
-
-        // Add to 64 bit running sum as to not roll over.
-        sqsum_64 = _mm_add_epi64(sqsum_64,
-                                _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z),
-                                              _mm_unpacklo_epi32(sqsum_32,z)));
-
-        imgBufO += _width * _skipNum;
-        imgBufP += _width * _skipNum;
-        numPixels += (width_end - _border);
-    }
-
-    WebRtc_Word64 sad_final_64[2];
-    WebRtc_Word64 sum_final_64[2];
-    WebRtc_Word64 sqsum_final_64[2];
-
-    // bring sums out of vector registers and into integer register
-    // domain, summing them along the way
-    _mm_store_si128 ((__m128i*)sad_final_64, sad_64);
-    _mm_store_si128 ((__m128i*)sum_final_64, sum_64);
-    _mm_store_si128 ((__m128i*)sqsum_final_64, sqsum_64);
-
-    const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1];
-    const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1];
-    const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1];
-
-    // default
-    _motionMagnitudeNZ = 0.0f;
-
-    if (tempDiffSum == 0)
-    {
-        return VPM_OK;
-    }
-
-    // normalize over all pixels
-    const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels);
-    const float pixelSumAvg = (float)pixelSum / (float)(numPixels);
-    const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels);
-    float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg);
-
-    if (contrast > 0.0)
-    {
-        contrast = sqrt(contrast);
-       _motionMagnitudeNZ = tempDiffAvg/contrast;
-    }
-
-    return VPM_OK;
-}
-#endif
-
 // Compute spatial metrics:
 // To reduce complexity, we compute the metric for a reduced set of points.
 // The spatial metrics are rough estimates of the prediction error cost for
@@ -427,172 +321,6 @@
     return VPM_OK;
 }
 
-#if defined(WEBRTC_USE_SSE2)
-WebRtc_Word32
-VPMContentAnalysis::ComputeSpatialMetrics_SSE2()
-{
-    const WebRtc_UWord8* imgBuf = _origFrame + _border*_width;
-    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
-
-    __m128i se_32  = _mm_setzero_si128();
-    __m128i sev_32 = _mm_setzero_si128();
-    __m128i seh_32 = _mm_setzero_si128();
-    __m128i msa_32 = _mm_setzero_si128();
-    const __m128i z = _mm_setzero_si128();
-
-    // Error is accumulated as a 32 bit value.  Looking at HD content with a
-    // height of 1080 lines, or about 67 macro blocks.  If the 16 bit row
-    // value is maxed out at 65529 for every row, 65529*1080 = 70777800, which
-    // will not roll over a 32 bit accumulator.
-    // _skipNum is also used to reduce the number of rows
-    for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum)
-    {
-        __m128i se_16  = _mm_setzero_si128();
-        __m128i sev_16 = _mm_setzero_si128();
-        __m128i seh_16 = _mm_setzero_si128();
-        __m128i msa_16 = _mm_setzero_si128();
-
-        // Row error is accumulated as a 16 bit value.  There are 8
-        // accumulators.  Max value of a 16 bit number is 65529.  Looking
-        // at HD content, 1080p, has a width of 1920, 120 macro blocks.
-        // A mb at a time is processed at a time.  Absolute max error at
-        // a point would be abs(0-255+255+255+255) which equals 1020.
-        // 120*1020 = 122400.  The probability of hitting this is quite low
-        // on well behaved content.  A specially crafted image could roll over.
-        // _border could also be adjusted to concentrate on just the center of
-        // the images for an HD capture in order to reduce the possiblity of
-        // rollover.
-        const WebRtc_UWord8 *lineTop = imgBuf - _width + _border;
-        const WebRtc_UWord8 *lineCen = imgBuf + _border;
-        const WebRtc_UWord8 *lineBot = imgBuf + _width + _border;
-
-        for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16)
-        {
-            const __m128i t = _mm_loadu_si128((__m128i*)(lineTop));
-            const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1));
-            const __m128i c = _mm_loadu_si128((__m128i*)(lineCen));
-            const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1));
-            const __m128i b = _mm_loadu_si128((__m128i*)(lineBot));
-
-            lineTop += 16;
-            lineCen += 16;
-            lineBot += 16;
-
-            // center pixel unpacked
-            __m128i clo = _mm_unpacklo_epi8(c,z);
-            __m128i chi = _mm_unpackhi_epi8(c,z);
-
-            // left right pixels unpacked and added together
-            const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z),
-                                               _mm_unpacklo_epi8(r,z));
-            const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z),
-                                               _mm_unpackhi_epi8(r,z));
-
-            // top & bottom pixels unpacked and added together
-            const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z),
-                                               _mm_unpacklo_epi8(b,z));
-            const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z),
-                                               _mm_unpackhi_epi8(b,z));
-
-            // running sum of all pixels
-            msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo));
-
-            clo = _mm_slli_epi16(clo, 1);
-            chi = _mm_slli_epi16(chi, 1);
-            const __m128i sevtlo = _mm_subs_epi16(clo, tblo);
-            const __m128i sevthi = _mm_subs_epi16(chi, tbhi);
-            const __m128i sehtlo = _mm_subs_epi16(clo, lrlo);
-            const __m128i sehthi = _mm_subs_epi16(chi, lrhi);
-
-            clo = _mm_slli_epi16(clo, 1);
-            chi = _mm_slli_epi16(chi, 1);
-            const __m128i setlo = _mm_subs_epi16(clo,
-                                                 _mm_add_epi16(lrlo, tblo));
-            const __m128i sethi = _mm_subs_epi16(chi,
-                                                 _mm_add_epi16(lrhi, tbhi));
-
-            // Add to 16 bit running sum
-            se_16  = _mm_add_epi16(se_16,
-                                   _mm_max_epi16(setlo,
-                                                 _mm_subs_epi16(z, setlo)));
-            se_16  = _mm_add_epi16(se_16,
-                                   _mm_max_epi16(sethi,
-                                                 _mm_subs_epi16(z, sethi)));
-            sev_16 = _mm_add_epi16(sev_16,
-                                   _mm_max_epi16(sevtlo,
-                                                 _mm_subs_epi16(z, sevtlo)));
-            sev_16 = _mm_add_epi16(sev_16,
-                                   _mm_max_epi16(sevthi,
-                                                 _mm_subs_epi16(z, sevthi)));
-            seh_16 = _mm_add_epi16(seh_16,
-                                   _mm_max_epi16(sehtlo,
-                                                 _mm_subs_epi16(z, sehtlo)));
-            seh_16 = _mm_add_epi16(seh_16,
-                                   _mm_max_epi16(sehthi,
-                                                 _mm_subs_epi16(z, sehthi)));
-        }
-
-        // Add to 32 bit running sum as to not roll over.
-        se_32  = _mm_add_epi32(se_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(se_16,z),
-                                             _mm_unpacklo_epi16(se_16,z)));
-        sev_32 = _mm_add_epi32(sev_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z),
-                                             _mm_unpacklo_epi16(sev_16,z)));
-        seh_32 = _mm_add_epi32(seh_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z),
-                                             _mm_unpacklo_epi16(seh_16,z)));
-        msa_32 = _mm_add_epi32(msa_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z),
-                                             _mm_unpacklo_epi16(msa_16,z)));
-
-        imgBuf += _width * _skipNum;
-    }
-
-    WebRtc_Word64 se_64[2];
-    WebRtc_Word64 sev_64[2];
-    WebRtc_Word64 seh_64[2];
-    WebRtc_Word64 msa_64[2];
-
-    // bring sums out of vector registers and into integer register
-    // domain, summing them along the way
-    _mm_store_si128 ((__m128i*)se_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(se_32,z),
-                                   _mm_unpacklo_epi32(se_32,z)));
-    _mm_store_si128 ((__m128i*)sev_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z),
-                                   _mm_unpacklo_epi32(sev_32,z)));
-    _mm_store_si128 ((__m128i*)seh_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z),
-                                   _mm_unpacklo_epi32(seh_32,z)));
-    _mm_store_si128 ((__m128i*)msa_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z),
-                                   _mm_unpacklo_epi32(msa_32,z)));
-
-    const WebRtc_UWord32 spatialErrSum  = se_64[0] + se_64[1];
-    const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1];
-    const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1];
-    const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1];
-
-    // normalize over all pixels
-    const float spatialErr  = (float)(spatialErrSum >> 2);
-    const float spatialErrH = (float)(spatialErrHSum >> 1);
-    const float spatialErrV = (float)(spatialErrVSum >> 1);
-    const float norm = (float)pixelMSA;
-
-    // 2X2:
-    _spatialPredErr = spatialErr / norm;
-
-    // 1X2:
-    _spatialPredErrH = spatialErrH / norm;
-
-    // 2X1:
-    _spatialPredErrV = spatialErrV / norm;
-
-    return VPM_OK;
-}
-#endif // #if defined(WEBRTC_USE_SSE2)
-
 VideoContentMetrics*
 VPMContentAnalysis::ContentMetrics()
 {
diff --git a/src/modules/video_processing/main/source/content_analysis.h b/src/modules/video_processing/main/source/content_analysis.h
index e0810d3..5051650 100644
--- a/src/modules/video_processing/main/source/content_analysis.h
+++ b/src/modules/video_processing/main/source/content_analysis.h
@@ -8,10 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-/*
- * content_analysis.h
- */
-
 #ifndef VPM_CONTENT_ANALYSIS_H
 #define VPM_CONTENT_ANALYSIS_H
 
@@ -24,7 +20,9 @@
 class VPMContentAnalysis
 {
 public:
-    VPMContentAnalysis(bool RTCD = true);
+    // When |runtime_cpu_detection| is true, runtime selection of an optimized
+    // code path is allowed.
+    VPMContentAnalysis(bool runtime_cpu_detection);
     ~VPMContentAnalysis();
 
     // Initialize ContentAnalysis - should be called prior to
@@ -62,7 +60,7 @@
     ComputeSpatialMetricsFunc ComputeSpatialMetrics;
     WebRtc_Word32 ComputeSpatialMetrics_C();
 
-#if defined(WEBRTC_USE_SSE2)
+#if defined(WEBRTC_ARCH_X86_FAMILY)
     WebRtc_Word32 ComputeSpatialMetrics_SSE2();
     WebRtc_Word32 TemporalDiffMetric_SSE2();
 #endif
diff --git a/src/modules/video_processing/main/source/content_analysis_sse2.cc b/src/modules/video_processing/main/source/content_analysis_sse2.cc
new file mode 100644
index 0000000..347fa5b
--- /dev/null
+++ b/src/modules/video_processing/main/source/content_analysis_sse2.cc
@@ -0,0 +1,284 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "content_analysis.h"
+
+#include <emmintrin.h>
+#include <math.h>
+
+namespace webrtc {
+
+// SSE2 implementation of TemporalDiffMetric; updates _motionMagnitudeNZ.
+WebRtc_Word32 VPMContentAnalysis::TemporalDiffMetric_SSE2()
+{
+    WebRtc_UWord32 numPixels = 0;       // counter for # of pixels
+
+    const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border;
+    const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border;
+
+    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
+
+    __m128i sad_64   = _mm_setzero_si128();
+    __m128i sum_64   = _mm_setzero_si128();
+    __m128i sqsum_64 = _mm_setzero_si128();
+    const __m128i z  = _mm_setzero_si128();
+
+    for(WebRtc_UWord16 i = 0; i < (_height - 2*_border); i += _skipNum)
+    {
+        __m128i sqsum_32  = _mm_setzero_si128();
+
+        const WebRtc_UWord8 *lineO = imgBufO;
+        const WebRtc_UWord8 *lineP = imgBufP;
+
+        // Work on 16 pixels at a time.  For HD content with a width of 1920
+        // this loop will run up to 120 times (depending on border).  Maximum
+        // for abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit
+        // results which are then accumulated.  There is no chance of
+        // rollover for these two accumulators.
+        // o*o will have a maximum of 255*255 = 65025.  Each 32 bit lane of
+        // sqsum_32 gains at most 4*65025 per iteration, which would roll
+        // over a 16 bit accumulator but fits in a 32 bit one for a row.
+        for(WebRtc_UWord16 j = 0; j < width_end - _border; j += 16)
+        {
+            const __m128i o = _mm_loadu_si128((__m128i*)(lineO));
+            const __m128i p = _mm_loadu_si128((__m128i*)(lineP));
+
+            lineO += 16;
+            lineP += 16;
+
+            // abs pixel difference between frames
+            sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p));
+
+            // sum of all pixels in frame
+            sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z));
+
+            // squared sum of all pixels in frame
+            const __m128i olo = _mm_unpacklo_epi8(o,z);
+            const __m128i ohi = _mm_unpackhi_epi8(o,z);
+
+            const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo);
+            const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi);
+
+            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo);
+            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi);
+        }
+
+        // Add to 64 bit running sum as to not roll over.
+        sqsum_64 = _mm_add_epi64(sqsum_64,
+                                _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z),
+                                              _mm_unpacklo_epi32(sqsum_32,z)));
+
+        imgBufO += _width * _skipNum;
+        imgBufP += _width * _skipNum;
+        numPixels += (width_end - _border);
+    }
+
+    WebRtc_Word64 sad_final_64[2];
+    WebRtc_Word64 sum_final_64[2];
+    WebRtc_Word64 sqsum_final_64[2];
+
+    // Bring the sums out of the vector registers. Unaligned stores are used
+    // because the destination arrays are not guaranteed 16-byte alignment.
+    _mm_storeu_si128((__m128i*)sad_final_64, sad_64);
+    _mm_storeu_si128((__m128i*)sum_final_64, sum_64);
+    _mm_storeu_si128((__m128i*)sqsum_final_64, sqsum_64);
+
+    const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1];
+    const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1];
+    const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1];
+
+    // default
+    _motionMagnitudeNZ = 0.0f;
+
+    if (tempDiffSum == 0)
+    {
+        return VPM_OK;
+    }
+
+    // normalize over all pixels
+    const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels);
+    const float pixelSumAvg = (float)pixelSum / (float)(numPixels);
+    const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels);
+    float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg);
+
+    if (contrast > 0.0)
+    {
+        contrast = sqrt(contrast);
+        _motionMagnitudeNZ = tempDiffAvg/contrast;
+    }
+
+    return VPM_OK;
+}
+
+// SSE2 implementation of ComputeSpatialMetrics; updates _spatialPredErr*.
+WebRtc_Word32 VPMContentAnalysis::ComputeSpatialMetrics_SSE2()
+{
+    const WebRtc_UWord8* imgBuf = _origFrame + _border*_width;
+    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
+
+    __m128i se_32  = _mm_setzero_si128();
+    __m128i sev_32 = _mm_setzero_si128();
+    __m128i seh_32 = _mm_setzero_si128();
+    __m128i msa_32 = _mm_setzero_si128();
+    const __m128i z = _mm_setzero_si128();
+
+    // Error is accumulated as a 32 bit value.  Looking at HD content with a
+    // height of 1080 lines, or about 67 macro blocks.  If the 16 bit row
+    // value is maxed out at 65535 for every row, 65535*1080 = 70777800, which
+    // will not roll over a 32 bit accumulator.
+    // _skipNum is also used to reduce the number of rows
+    for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum)
+    {
+        __m128i se_16  = _mm_setzero_si128();
+        __m128i sev_16 = _mm_setzero_si128();
+        __m128i seh_16 = _mm_setzero_si128();
+        __m128i msa_16 = _mm_setzero_si128();
+
+        // Row error is accumulated as a 16 bit value.  There are 8
+        // accumulators.  Max value of a 16 bit number is 65535.  Looking
+        // at HD content, 1080p, has a width of 1920, 120 macro blocks.
+        // One macro block (16 pixels) is processed at a time.  Absolute max
+        // error at a point would be abs(0-255+255+255+255) which equals 1020.
+        // 120*1020 = 122400.  The probability of hitting this is quite low
+        // on well behaved content.  A specially crafted image could roll over.
+        // _border could also be adjusted to concentrate on just the center of
+        // the images for an HD capture in order to reduce the possibility of
+        // rollover.
+        const WebRtc_UWord8 *lineTop = imgBuf - _width + _border;
+        const WebRtc_UWord8 *lineCen = imgBuf + _border;
+        const WebRtc_UWord8 *lineBot = imgBuf + _width + _border;
+
+        for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16)
+        {
+            const __m128i t = _mm_loadu_si128((__m128i*)(lineTop));
+            const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1));
+            const __m128i c = _mm_loadu_si128((__m128i*)(lineCen));
+            const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1));
+            const __m128i b = _mm_loadu_si128((__m128i*)(lineBot));
+
+            lineTop += 16;
+            lineCen += 16;
+            lineBot += 16;
+
+            // center pixel unpacked
+            __m128i clo = _mm_unpacklo_epi8(c,z);
+            __m128i chi = _mm_unpackhi_epi8(c,z);
+
+            // left right pixels unpacked and added together
+            const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z),
+                                               _mm_unpacklo_epi8(r,z));
+            const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z),
+                                               _mm_unpackhi_epi8(r,z));
+
+            // top & bottom pixels unpacked and added together
+            const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z),
+                                               _mm_unpacklo_epi8(b,z));
+            const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z),
+                                               _mm_unpackhi_epi8(b,z));
+
+            // running sum of all pixels
+            msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo));
+
+            clo = _mm_slli_epi16(clo, 1);
+            chi = _mm_slli_epi16(chi, 1);
+            const __m128i sevtlo = _mm_subs_epi16(clo, tblo);
+            const __m128i sevthi = _mm_subs_epi16(chi, tbhi);
+            const __m128i sehtlo = _mm_subs_epi16(clo, lrlo);
+            const __m128i sehthi = _mm_subs_epi16(chi, lrhi);
+
+            clo = _mm_slli_epi16(clo, 1);
+            chi = _mm_slli_epi16(chi, 1);
+            const __m128i setlo = _mm_subs_epi16(clo,
+                                                 _mm_add_epi16(lrlo, tblo));
+            const __m128i sethi = _mm_subs_epi16(chi,
+                                                 _mm_add_epi16(lrhi, tbhi));
+
+            // Add to 16 bit running sum
+            se_16  = _mm_add_epi16(se_16,
+                                   _mm_max_epi16(setlo,
+                                                 _mm_subs_epi16(z, setlo)));
+            se_16  = _mm_add_epi16(se_16,
+                                   _mm_max_epi16(sethi,
+                                                 _mm_subs_epi16(z, sethi)));
+            sev_16 = _mm_add_epi16(sev_16,
+                                   _mm_max_epi16(sevtlo,
+                                                 _mm_subs_epi16(z, sevtlo)));
+            sev_16 = _mm_add_epi16(sev_16,
+                                   _mm_max_epi16(sevthi,
+                                                 _mm_subs_epi16(z, sevthi)));
+            seh_16 = _mm_add_epi16(seh_16,
+                                   _mm_max_epi16(sehtlo,
+                                                 _mm_subs_epi16(z, sehtlo)));
+            seh_16 = _mm_add_epi16(seh_16,
+                                   _mm_max_epi16(sehthi,
+                                                 _mm_subs_epi16(z, sehthi)));
+        }
+
+        // Add to 32 bit running sum as to not roll over.
+        se_32  = _mm_add_epi32(se_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(se_16,z),
+                                             _mm_unpacklo_epi16(se_16,z)));
+        sev_32 = _mm_add_epi32(sev_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z),
+                                             _mm_unpacklo_epi16(sev_16,z)));
+        seh_32 = _mm_add_epi32(seh_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z),
+                                             _mm_unpacklo_epi16(seh_16,z)));
+        msa_32 = _mm_add_epi32(msa_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z),
+                                             _mm_unpacklo_epi16(msa_16,z)));
+
+        imgBuf += _width * _skipNum;
+    }
+
+    WebRtc_Word64 se_64[2];
+    WebRtc_Word64 sev_64[2];
+    WebRtc_Word64 seh_64[2];
+    WebRtc_Word64 msa_64[2];
+
+    // Reduce each accumulator to two 64 bit partial sums and store them with
+    // unaligned stores; the arrays are not guaranteed 16-byte alignment.
+    _mm_storeu_si128((__m128i*)se_64,
+                     _mm_add_epi64(_mm_unpackhi_epi32(se_32,z),
+                                   _mm_unpacklo_epi32(se_32,z)));
+    _mm_storeu_si128((__m128i*)sev_64,
+                     _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z),
+                                   _mm_unpacklo_epi32(sev_32,z)));
+    _mm_storeu_si128((__m128i*)seh_64,
+                     _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z),
+                                   _mm_unpacklo_epi32(seh_32,z)));
+    _mm_storeu_si128((__m128i*)msa_64,
+                     _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z),
+                                   _mm_unpacklo_epi32(msa_32,z)));
+
+    const WebRtc_UWord32 spatialErrSum  = se_64[0] + se_64[1];
+    const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1];
+    const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1];
+    const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1];
+
+    // normalize over all pixels
+    const float spatialErr  = (float)(spatialErrSum >> 2);
+    const float spatialErrH = (float)(spatialErrHSum >> 1);
+    const float spatialErrV = (float)(spatialErrVSum >> 1);
+    const float norm = (float)pixelMSA;
+
+    // 2X2:
+    _spatialPredErr = spatialErr / norm;
+
+    // 1X2:
+    _spatialPredErrH = spatialErrH / norm;
+
+    // 2X1:
+    _spatialPredErrV = spatialErrV / norm;
+
+    return VPM_OK;
+}
+
+}  // namespace webrtc
diff --git a/src/modules/video_processing/main/source/frame_preprocessor.cc b/src/modules/video_processing/main/source/frame_preprocessor.cc
index 57bc84d..14ced41 100644
--- a/src/modules/video_processing/main/source/frame_preprocessor.cc
+++ b/src/modules/video_processing/main/source/frame_preprocessor.cc
@@ -22,7 +22,7 @@
 _enableCA(false)
 {
     _spatialResampler = new VPMSimpleSpatialResampler();
-    _ca = new VPMContentAnalysis();
+    _ca = new VPMContentAnalysis(true);
     _vd = new VPMVideoDecimator();
 }
 
diff --git a/src/modules/video_processing/main/source/video_processing.gypi b/src/modules/video_processing/main/source/video_processing.gypi
index 8ca831d..dda0a78 100644
--- a/src/modules/video_processing/main/source/video_processing.gypi
+++ b/src/modules/video_processing/main/source/video_processing.gypi
@@ -14,7 +14,7 @@
       'dependencies': [
         'webrtc_utility',
         '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
-         '<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv',
+        '<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv',
         '<(webrtc_root)/system_wrappers/source/system_wrappers.gyp:system_wrappers',
       ],
       'include_dirs': [
@@ -26,41 +26,57 @@
         ],
       },
       'sources': [
-        # interfaces
         '../interface/video_processing.h',
         '../interface/video_processing_defines.h',
-
-        # headers
-        'video_processing_impl.h',
+        'brighten.cc',
+        'brighten.h',
+        'brightness_detection.cc',
         'brightness_detection.h',
-	'brighten.h',
+        'color_enhancement.cc',
         'color_enhancement.h',
         'color_enhancement_private.h',
-        'content_analysis.h',
-        'deflickering.h',
-        'denoising.h',
-        'frame_preprocessor.h',
-        'spatial_resampler.h',
-        'video_decimator.h',
-
-        # sources
-        'video_processing_impl.cc',
-        'brightness_detection.cc',
-	'brighten.cc',
-        'color_enhancement.cc',
         'content_analysis.cc',
+        'content_analysis.h',
         'deflickering.cc',
+        'deflickering.h',
         'denoising.cc',
+        'denoising.h',
         'frame_preprocessor.cc',
+        'frame_preprocessor.h',
         'spatial_resampler.cc',
+        'spatial_resampler.h',
         'video_decimator.cc',
-      ], # source
+        'video_decimator.h',
+        'video_processing_impl.cc',
+        'video_processing_impl.h',
+      ],
+      'conditions': [
+        ['target_arch=="ia32" or target_arch=="x64"', {
+          'dependencies': [ 'video_processing_sse2', ],
+        }],
+      ],
+    },
+    {  # Separate target for SSE2-intrinsic sources so -msse2 is scoped to them only.
+      'target_name': 'video_processing_sse2',
+      'type': '<(library)',
+      'sources': [
+        'content_analysis_sse2.cc',
+      ],
+      'include_dirs': [
+        '../interface',
+        '../../../interface',
+      ],
+      'conditions': [
+        ['os_posix==1 and OS!="mac"', {  # POSIX builds other than Mac.
+          'cflags': [ '-msse2', ],  # gcc needs this flag to compile SSE2 intrinsics.
+        }],
+        ['OS=="mac"', {  # Mac passes the same flag through the Xcode settings.
+          'xcode_settings': {
+            'OTHER_CFLAGS': [ '-msse2', ],
+          },
+        }],
+      ],
     },
   ],
 }
 
-# Local Variables:
-# tab-width:2
-# indent-tabs-mode:nil
-# End:
-# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc b/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc
index 20e803c..b25c45f 100644
--- a/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc
+++ b/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc
@@ -17,7 +17,7 @@
 TEST_F(VideoProcessingModuleTest, ContentAnalysis)
 {
     VPMContentAnalysis    _ca_c(false);
-    VPMContentAnalysis    _ca_sse;
+    VPMContentAnalysis    _ca_sse(true);
     VideoContentMetrics  *_cM_c, *_cM_SSE;
 
     _ca_c.Initialize(_width,_height);