Update libyuv to r1645 to fix Android build warnings

The r1602 drop, when built with Android.mk, produced unused-parameter
build warnings, so the warning was disabled. This CL fixes the source
and re-enables the warning.
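
A minimal sketch (not part of this CL) of the kind of change involved:
with -Wunused-parameter enabled, a parameter that is never read triggers
a warning, and the usual source-level fix is to drop or comment out the
parameter name. The function and values below are hypothetical.

  // example_unused_param.cc -- build with: g++ -c -Wall -Wunused-parameter
  //
  // Before (warns): the 'seed' parameter is never used.
  //   static unsigned HashStub(const unsigned char* src, int count, unsigned seed) {
  //     return static_cast<unsigned>(count) + src[0];
  //   }
  //
  // After (clean): the unused parameter name is commented out.
  static unsigned HashStub(const unsigned char* src, int count,
                           unsigned /*seed*/) {
    return static_cast<unsigned>(count) + src[0];  // 'seed' intentionally ignored
  }

  int main() {
    const unsigned char buf[16] = {1};
    return HashStub(buf, sizeof(buf), 5381u) == 17 ? 0 : 1;
  }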

Bug: 35099807
Test: mm for libyuv builds cleanly.
Change-Id: If6b344ca39b2c321e277421cdeb817a5b1cc2514
diff --git a/files/source/compare.cc b/files/source/compare.cc
index e3846bd..1facd27 100644
--- a/files/source/compare.cc
+++ b/files/source/compare.cc
@@ -32,8 +32,7 @@
 uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
   const int kBlockSize = 1 << 15;  // 32768;
   int remainder;
-  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
-      HashDjb2_C;
+  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
 #if defined(HAS_HASHDJB2_SSE41)
   if (TestCpuFlag(kCpuHasSSE41)) {
     HashDjb2_SSE = HashDjb2_SSE41;
@@ -50,13 +49,13 @@
     src += kBlockSize;
     count -= kBlockSize;
   }
-  remainder = (int)(count) & ~15;
+  remainder = (int)count & ~15;
   if (remainder) {
     seed = HashDjb2_SSE(src, remainder, seed);
     src += remainder;
     count -= remainder;
   }
-  remainder = (int)(count) & 15;
+  remainder = (int)count & 15;
   if (remainder) {
     seed = HashDjb2_C(src, remainder, seed);
   }
@@ -113,7 +112,8 @@
 
 // TODO(fbarchard): Refactor into row function.
 LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+uint64 ComputeSumSquareError(const uint8* src_a,
+                             const uint8* src_b,
                              int count) {
   // SumSquareError returns values 0 to 65535 for each squared difference.
   // Up to 65536 of those can be summed and remain within a uint32.
@@ -142,7 +142,7 @@
   }
 #endif
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: sse)
+#pragma omp parallel for reduction(+ : sse)
 #endif
   for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
     sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
@@ -162,14 +162,16 @@
 }
 
 LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
-                                  const uint8* src_b, int stride_b,
-                                  int width, int height) {
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
+                                  int stride_a,
+                                  const uint8* src_b,
+                                  int stride_b,
+                                  int width,
+                                  int height) {
   uint64 sse = 0;
   int h;
   // Coalesce rows.
-  if (stride_a == width &&
-      stride_b == width) {
+  if (stride_a == width && stride_b == width) {
     width *= height;
     height = 1;
     stride_a = stride_b = 0;
@@ -186,10 +188,10 @@
 double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
   double psnr;
   if (sse > 0) {
-    double mse = (double)(count) / (double)(sse);
+    double mse = (double)count / (double)sse;
     psnr = 10.0 * log10(255.0 * 255.0 * mse);
   } else {
-    psnr = kMaxPsnr;      // Limit to prevent divide by 0
+    psnr = kMaxPsnr;  // Limit to prevent divide by 0
   }
 
   if (psnr > kMaxPsnr)
@@ -199,45 +201,53 @@
 }
 
 LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
+double CalcFramePsnr(const uint8* src_a,
+                     int stride_a,
+                     const uint8* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
   const uint64 samples = width * height;
-  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
-                                                src_b, stride_b,
-                                                width, height);
+  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
+                                                stride_b, width, height);
   return SumSquareErrorToPsnr(sse, samples);
 }
 
 LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
-                                                  src_y_b, stride_y_b,
-                                                  width, height);
+double I420Psnr(const uint8* src_y_a,
+                int stride_y_a,
+                const uint8* src_u_a,
+                int stride_u_a,
+                const uint8* src_v_a,
+                int stride_v_a,
+                const uint8* src_y_b,
+                int stride_y_b,
+                const uint8* src_u_b,
+                int stride_u_b,
+                const uint8* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b,
+                                                  stride_y_b, width, height);
   const int width_uv = (width + 1) >> 1;
   const int height_uv = (height + 1) >> 1;
-  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
-                                                  src_u_b, stride_u_b,
-                                                  width_uv, height_uv);
-  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
-                                                  src_v_b, stride_v_b,
-                                                  width_uv, height_uv);
+  const uint64 sse_u = ComputeSumSquareErrorPlane(
+      src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
+  const uint64 sse_v = ComputeSumSquareErrorPlane(
+      src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
   const uint64 samples = width * height + 2 * (width_uv * height_uv);
   const uint64 sse = sse_y + sse_u + sse_v;
   return SumSquareErrorToPsnr(sse, samples);
 }
 
-static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
+static const int64 cc1 = 26634;   // (64^2*(.01*255)^2
 static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
 
-static double Ssim8x8_C(const uint8* src_a, int stride_a,
-                        const uint8* src_b, int stride_b) {
+static double Ssim8x8_C(const uint8* src_a,
+                        int stride_a,
+                        const uint8* src_b,
+                        int stride_b) {
   int64 sum_a = 0;
   int64 sum_b = 0;
   int64 sum_sq_a = 0;
@@ -270,12 +280,12 @@
     const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
                          (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
 
-    const int64 sum_a_sq = sum_a*sum_a;
-    const int64 sum_b_sq = sum_b*sum_b;
+    const int64 sum_a_sq = sum_a * sum_a;
+    const int64 sum_b_sq = sum_b * sum_b;
 
-    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
-                         (count * sum_sq_a - sum_a_sq +
-                          count * sum_sq_b - sum_b_sq + c2);
+    const int64 ssim_d =
+        (sum_a_sq + sum_b_sq + c1) *
+        (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
 
     if (ssim_d == 0.0) {
       return DBL_MAX;
@@ -288,13 +298,16 @@
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
 LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
+double CalcFrameSsim(const uint8* src_a,
+                     int stride_a,
+                     const uint8* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
   int samples = 0;
   double ssim_total = 0;
-  double (*Ssim8x8)(const uint8* src_a, int stride_a,
-                    const uint8* src_b, int stride_b) = Ssim8x8_C;
+  double (*Ssim8x8)(const uint8* src_a, int stride_a, const uint8* src_b,
+                    int stride_b) = Ssim8x8_C;
 
   // sample point start with each 4x4 location
   int i;
@@ -314,22 +327,27 @@
 }
 
 LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
-                                      src_y_b, stride_y_b, width, height);
+double I420Ssim(const uint8* src_y_a,
+                int stride_y_a,
+                const uint8* src_u_a,
+                int stride_u_a,
+                const uint8* src_v_a,
+                int stride_v_a,
+                const uint8* src_y_b,
+                int stride_y_b,
+                const uint8* src_u_b,
+                int stride_u_b,
+                const uint8* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  const double ssim_y =
+      CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
   const int width_uv = (width + 1) >> 1;
   const int height_uv = (height + 1) >> 1;
-  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
-                                      src_u_b, stride_u_b,
+  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b,
                                       width_uv, height_uv);
-  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
-                                      src_v_b, stride_v_b,
+  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b,
                                       width_uv, height_uv);
   return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
 }
diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc
index 1b83edb..64522aa 100644
--- a/files/source/compare_gcc.cc
+++ b/files/source/compare_gcc.cc
@@ -62,30 +62,30 @@
   return sse;
 }
 
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+static uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
 static uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
+    0x0c3525e1,  // 33 ^ 15
+    0xa3476dc1,  // 33 ^ 14
+    0x3b4039a1,  // 33 ^ 13
+    0x4f5f0981,  // 33 ^ 12
 };
 static uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
+    0x30f35d61,  // 33 ^ 11
+    0x855cb541,  // 33 ^ 10
+    0x040a9121,  // 33 ^ 9
+    0x747c7101,  // 33 ^ 8
 };
 static uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
+    0xec41d4e1,  // 33 ^ 7
+    0x4cfa3cc1,  // 33 ^ 6
+    0x025528a1,  // 33 ^ 5
+    0x00121881,  // 33 ^ 4
 };
 static uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
+    0x00008c61,  // 33 ^ 3
+    0x00000441,  // 33 ^ 2
+    0x00000021,  // 33 ^ 1
+    0x00000001,  // 33 ^ 0
 };
 
 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
@@ -148,4 +148,3 @@
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/files/source/compare_win.cc b/files/source/compare_win.cc
index dc86fe2..b17fc8e 100644
--- a/files/source/compare_win.cc
+++ b/files/source/compare_win.cc
@@ -21,12 +21,12 @@
 // This module is for 32 bit Visual C x86 and clangcl
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
 
-__declspec(naked)
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+__declspec(naked) uint32
+    SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
+    mov        eax, [esp + 4]  // src_a
+    mov        edx, [esp + 8]  // src_b
+    mov        ecx, [esp + 12]  // count
     pxor       xmm0, xmm0
     pxor       xmm5, xmm5
 
@@ -61,13 +61,13 @@
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked)
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+#pragma warning(disable : 4752)
+__declspec(naked) uint32
+    SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
   __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
+    mov        eax, [esp + 4]  // src_a
+    mov        edx, [esp + 8]  // src_b
+    mov        ecx, [esp + 12]  // count
     vpxor      ymm0, ymm0, ymm0  // sum
     vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
     sub        edx, eax
@@ -101,65 +101,65 @@
 }
 #endif  // _MSC_VER >= 1700
 
-uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
 uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
+    0x0c3525e1,  // 33 ^ 15
+    0xa3476dc1,  // 33 ^ 14
+    0x3b4039a1,  // 33 ^ 13
+    0x4f5f0981,  // 33 ^ 12
 };
 uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
+    0x30f35d61,  // 33 ^ 11
+    0x855cb541,  // 33 ^ 10
+    0x040a9121,  // 33 ^ 9
+    0x747c7101,  // 33 ^ 8
 };
 uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
+    0xec41d4e1,  // 33 ^ 7
+    0x4cfa3cc1,  // 33 ^ 6
+    0x025528a1,  // 33 ^ 5
+    0x00121881,  // 33 ^ 4
 };
 uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
+    0x00008c61,  // 33 ^ 3
+    0x00000441,  // 33 ^ 2
+    0x00000021,  // 33 ^ 1
+    0x00000001,  // 33 ^ 0
 };
 
-__declspec(naked)
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32
+    HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
   __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
+    mov        eax, [esp + 4]  // src
+    mov        ecx, [esp + 8]  // count
     movd       xmm0, [esp + 12]  // seed
 
-    pxor       xmm7, xmm7        // constant 0 for unpck
+    pxor       xmm7, xmm7  // constant 0 for unpck
     movdqa     xmm6, xmmword ptr kHash16x33
 
   wloop:
-    movdqu     xmm1, [eax]       // src[0-15]
+    movdqu     xmm1, [eax]  // src[0-15]
     lea        eax, [eax + 16]
-    pmulld     xmm0, xmm6        // hash *= 33 ^ 16
+    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
     movdqa     xmm5, xmmword ptr kHashMul0
     movdqa     xmm2, xmm1
-    punpcklbw  xmm2, xmm7        // src[0-7]
+    punpcklbw  xmm2, xmm7  // src[0-7]
     movdqa     xmm3, xmm2
-    punpcklwd  xmm3, xmm7        // src[0-3]
+    punpcklwd  xmm3, xmm7  // src[0-3]
     pmulld     xmm3, xmm5
     movdqa     xmm5, xmmword ptr kHashMul1
     movdqa     xmm4, xmm2
-    punpckhwd  xmm4, xmm7        // src[4-7]
+    punpckhwd  xmm4, xmm7  // src[4-7]
     pmulld     xmm4, xmm5
     movdqa     xmm5, xmmword ptr kHashMul2
-    punpckhbw  xmm1, xmm7        // src[8-15]
+    punpckhbw  xmm1, xmm7  // src[8-15]
     movdqa     xmm2, xmm1
-    punpcklwd  xmm2, xmm7        // src[8-11]
+    punpcklwd  xmm2, xmm7  // src[8-11]
     pmulld     xmm2, xmm5
     movdqa     xmm5, xmmword ptr kHashMul3
-    punpckhwd  xmm1, xmm7        // src[12-15]
+    punpckhwd  xmm1, xmm7  // src[12-15]
     pmulld     xmm1, xmm5
-    paddd      xmm3, xmm4        // add 16 results
+    paddd      xmm3, xmm4  // add 16 results
     paddd      xmm1, xmm2
     paddd      xmm1, xmm3
 
@@ -171,18 +171,18 @@
     sub        ecx, 16
     jg         wloop
 
-    movd       eax, xmm0         // return hash
+    movd       eax, xmm0  // return hash
     ret
   }
 }
 
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
-__declspec(naked)
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32
+    HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
   __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
+    mov        eax, [esp + 4]  // src
+    mov        ecx, [esp + 8]  // count
     vmovd      xmm0, [esp + 12]  // seed
 
   wloop:
@@ -196,7 +196,7 @@
     vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
     lea        eax, [eax + 16]
     vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
-    vpaddd     xmm3, xmm3, xmm4        // add 16 results
+    vpaddd     xmm3, xmm3, xmm4  // add 16 results
     vpaddd     xmm1, xmm1, xmm2
     vpaddd     xmm1, xmm1, xmm3
     vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
@@ -207,7 +207,7 @@
     sub        ecx, 16
     jg         wloop
 
-    vmovd      eax, xmm0         // return hash
+    vmovd      eax, xmm0  // return hash
     vzeroupper
     ret
   }
diff --git a/files/source/convert.cc b/files/source/convert.cc
index e332bc5..f79acac 100644
--- a/files/source/convert.cc
+++ b/files/source/convert.cc
@@ -28,31 +28,37 @@
 }
 
 // Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int src_uv_width, int src_uv_height) {
+static int I4xxToI420(const uint8* src_y,
+                      int src_stride_y,
+                      const uint8* src_u,
+                      int src_stride_u,
+                      const uint8* src_v,
+                      int src_stride_v,
+                      uint8* dst_y,
+                      int dst_stride_y,
+                      uint8* dst_u,
+                      int dst_stride_u,
+                      uint8* dst_v,
+                      int dst_stride_v,
+                      int src_y_width,
+                      int src_y_height,
+                      int src_uv_width,
+                      int src_uv_height) {
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
   const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
-  if (src_y_width == 0 || src_y_height == 0 ||
-      src_uv_width == 0 || src_uv_height == 0) {
+  if (src_uv_width == 0 || src_uv_height == 0) {
     return -1;
   }
-  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-             dst_y, dst_stride_y, dst_y_width, dst_y_height,
-             kFilterBilinear);
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
+  if (dst_y) {
+    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+               dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+  }
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+             dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+             dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
   return 0;
 }
 
@@ -60,18 +66,23 @@
 // TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
 // is does row coalescing.
 LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
+int I420Copy(const uint8* src_y,
+             int src_stride_y,
+             const uint8* src_u,
+             int src_stride_u,
+             const uint8* src_v,
+             int src_stride_v,
+             uint8* dst_y,
+             int dst_stride_y,
+             uint8* dst_u,
+             int dst_stride_u,
+             uint8* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -98,76 +109,63 @@
 // 422 chroma is 1/2 width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I422ToI420(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int src_uv_width = SUBSAMPLE(width, 1, 1);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, src_uv_width, height);
 }
 
 // 444 chroma is 1x width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    width, height);
-}
-
-// 411 chroma is 1/4 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int src_uv_width = SUBSAMPLE(width, 3, 2);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
+int I444ToI420(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, width, height);
 }
 
 // I400 is greyscale typically used in MJPG
 LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I400ToI420(const uint8* src_y,
+               int src_stride_y,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -177,15 +175,21 @@
     src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
   SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
   SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
   return 0;
 }
 
-static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
-                       uint8* dst, int dst_stride,
-                       int width, int height) {
+static void CopyPlane2(const uint8* src,
+                       int src_stride_0,
+                       int src_stride_1,
+                       uint8* dst,
+                       int dst_stride,
+                       int width,
+                       int height) {
   int y;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_SSE2)
@@ -236,27 +240,30 @@
 //   The UV plane is half width, but 2 values, so src_stride_m420 applies to
 //   this as well as the two Y planes.
 static int X420ToI420(const uint8* src_y,
-                      int src_stride_y0, int src_stride_y1,
-                      const uint8* src_uv, int src_stride_uv,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int width, int height) {
-  int y;
+                      int src_stride_y0,
+                      int src_stride_y1,
+                      const uint8* src_uv,
+                      int src_stride_uv,
+                      uint8* dst_y,
+                      int dst_stride_y,
+                      uint8* dst_u,
+                      int dst_stride_u,
+                      uint8* dst_v,
+                      int dst_stride_v,
+                      int width,
+                      int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) = SplitUVRow_C;
-  if (!src_y || !src_uv ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
+    if (dst_y) {
+      dst_y = dst_y + (height - 1) * dst_stride_y;
+    }
     dst_u = dst_u + (halfheight - 1) * dst_stride_u;
     dst_v = dst_v + (halfheight - 1) * dst_stride_v;
     dst_stride_y = -dst_stride_y;
@@ -264,56 +271,19 @@
     dst_stride_v = -dst_stride_v;
   }
   // Coalesce rows.
-  if (src_stride_y0 == width &&
-      src_stride_y1 == width &&
+  if (src_stride_y0 == width && src_stride_y1 == width &&
       dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
   }
   // Coalesce rows.
-  if (src_stride_uv == halfwidth * 2 &&
-      dst_stride_u == halfwidth &&
+  if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
       dst_stride_v == halfwidth) {
     halfwidth *= halfheight;
     halfheight = 1;
     src_stride_uv = dst_stride_u = dst_stride_v = 0;
   }
-#if defined(HAS_SPLITUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    SplitUVRow = SplitUVRow_Any_SSE2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    SplitUVRow = SplitUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      SplitUVRow = SplitUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    SplitUVRow = SplitUVRow_Any_NEON;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_SPLITUVROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
-      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
-      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
-    SplitUVRow = SplitUVRow_Any_DSPR2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      SplitUVRow = SplitUVRow_DSPR2;
-    }
-  }
-#endif
 
   if (dst_y) {
     if (src_stride_y0 == src_stride_y1) {
@@ -324,75 +294,86 @@
     }
   }
 
-  for (y = 0; y < halfheight; ++y) {
-    // Copy a row of UV.
-    SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-    src_uv += src_stride_uv;
-  }
+  // Split UV plane - NV12 / NV21
+  SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
+               halfwidth, halfheight);
+
   return 0;
 }
 
 // Convert NV12 to I420.
 LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_uv, src_stride_uv,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height);
+int NV12ToI420(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_uv,
+               int src_stride_uv,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
+                    dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                    dst_stride_v, width, height);
 }
 
 // Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
 LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_vu, src_stride_vu,
-                    dst_y, dst_stride_y,
-                    dst_v, dst_stride_v,
-                    dst_u, dst_stride_u,
-                    width, height);
+int NV21ToI420(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_vu,
+               int src_stride_vu,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
+                    dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
+                    dst_stride_u, width, height);
 }
 
 // Convert M420 to I420.
 LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int M420ToI420(const uint8* src_m420,
+               int src_stride_m420,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
-                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
+                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
+                    dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
                     width, height);
 }
 
 // Convert YUY2 to I420.
 LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int YUY2ToI420(const uint8* src_yuy2,
+               int src_stride_yuy2,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
-      uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2,
-      uint8* dst_y, int width) = YUY2ToYRow_C;
+  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u,
+                      uint8* dst_v, int width) = YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+      YUY2ToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -429,6 +410,16 @@
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUVRow = YUY2ToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
@@ -448,16 +439,21 @@
 
 // Convert UYVY to I420.
 LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int UYVYToI420(const uint8* src_uyvy,
+               int src_stride_uyvy,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
-      uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-      uint8* dst_y, int width) = UYVYToYRow_C;
+  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_u,
+                      uint8* dst_v, int width) = UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) =
+      UYVYToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -494,6 +490,16 @@
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUVRow = UYVYToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUVRow = UYVYToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
@@ -513,19 +519,22 @@
 
 // Convert ARGB to I420.
 LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI420(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -570,6 +579,38 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToYRow = ARGBToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -589,19 +630,22 @@
 
 // Convert BGRA to I420.
 LIBYUV_API
-int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int BGRAToI420(const uint8* src_bgra,
+               int src_stride_bgra,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
-      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, uint8* dst_u,
+                      uint8* dst_v, int width) = BGRAToUVRow_C;
   void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
       BGRAToYRow_C;
-  if (!src_bgra ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -629,12 +673,44 @@
   }
 #endif
 #if defined(HAS_BGRATOUVROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      BGRAToUVRow = BGRAToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        BGRAToUVRow = BGRAToUVRow_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BGRAToUVRow = BGRAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_NEON;
     }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    BGRAToYRow = BGRAToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      BGRAToYRow = BGRAToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    BGRAToUVRow = BGRAToUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    BGRAToYRow = BGRAToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToYRow = BGRAToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    BGRAToUVRow = BGRAToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_MSA;
+    }
+  }
 #endif
 
   for (y = 0; y < height - 1; y += 2) {
@@ -655,19 +731,22 @@
 
 // Convert ABGR to I420.
 LIBYUV_API
-int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ABGRToI420(const uint8* src_abgr,
+               int src_stride_abgr,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
-      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, uint8* dst_u,
+                      uint8* dst_v, int width) = ABGRToUVRow_C;
   void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
       ABGRToYRow_C;
-  if (!src_abgr ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -702,6 +781,38 @@
     }
   }
 #endif
+#if defined(HAS_ABGRTOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ABGRToYRow = ABGRToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
@@ -721,19 +832,22 @@
 
 // Convert RGBA to I420.
 LIBYUV_API
-int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int RGBAToI420(const uint8* src_rgba,
+               int src_stride_rgba,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
-      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, uint8* dst_u,
+                      uint8* dst_v, int width) = RGBAToUVRow_C;
   void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
       RGBAToYRow_C;
-  if (!src_rgba ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -768,6 +882,38 @@
     }
   }
 #endif
+#if defined(HAS_RGBATOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    RGBAToYRow = RGBAToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      RGBAToYRow = RGBAToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    RGBAToUVRow = RGBAToUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGBAToYRow = RGBAToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGBAToUVRow = RGBAToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
@@ -787,27 +933,31 @@
 
 // Convert RGB24 to I420.
 LIBYUV_API
-int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height) {
+int RGB24ToI420(const uint8* src_rgb24,
+                int src_stride_rgb24,
+                uint8* dst_y,
+                int dst_stride_y,
+                uint8* dst_u,
+                int dst_stride_u,
+                uint8* dst_v,
+                int dst_stride_v,
+                int width,
+                int height) {
   int y;
-#if defined(HAS_RGB24TOYROW_NEON)
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
   void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
-      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+                       uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
   void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
       RGB24ToYRow_C;
 #else
   void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -829,6 +979,15 @@
       }
     }
   }
+#elif defined(HAS_RGB24TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+    RGB24ToYRow = RGB24ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToYRow = RGB24ToYRow_MSA;
+      RGB24ToUVRow = RGB24ToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from RGB24 to ARGB.
 #else
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
@@ -865,63 +1024,67 @@
     align_buffer_64(row, kRowSize * 2);
 #endif
 
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
-      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
-      RGB24ToYRow(src_rgb24, dst_y, width);
-      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+  for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+    RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+    RGB24ToYRow(src_rgb24, dst_y, width);
+    RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
 #else
-      RGB24ToARGBRow(src_rgb24, row, width);
-      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+    RGB24ToARGBRow(src_rgb24, row, width);
+    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-      src_rgb24 += src_stride_rgb24 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
-      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
-      RGB24ToYRow(src_rgb24, dst_y, width);
-#else
-      RGB24ToARGBRow(src_rgb24, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_RGB24TOYROW_NEON)
-    free_aligned_buffer_64(row);
+    src_rgb24 += src_stride_rgb24 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
   }
+  if (height & 1) {
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+    RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+    RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+    RGB24ToARGBRow(src_rgb24, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
 #endif
-  return 0;
+  }
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+  free_aligned_buffer_64(row);
+}
+#endif
+return 0;
 }
 
 // Convert RAW to I420.
 LIBYUV_API
-int RAWToI420(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height) {
+int RAWToI420(const uint8* src_raw,
+              int src_stride_raw,
+              uint8* dst_y,
+              int dst_stride_y,
+              uint8* dst_u,
+              int dst_stride_u,
+              uint8* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
   int y;
-#if defined(HAS_RAWTOYROW_NEON)
-  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
-      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, uint8* dst_u,
+                     uint8* dst_v, int width) = RAWToUVRow_C;
   void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
       RAWToYRow_C;
 #else
   void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
       RAWToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_raw || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -943,99 +1106,121 @@
       }
     }
   }
+#elif defined(HAS_RAWTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToUVRow = RAWToUVRow_Any_MSA;
+    RAWToYRow = RAWToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYRow = RAWToYRow_MSA;
+      RAWToUVRow = RAWToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from RAW to ARGB.
 #else
 #if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    if (TestCpuFlag(kCpuHasSSSE3)) {
+      RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+      if (IS_ALIGNED(width, 16)) {
+        RAWToARGBRow = RAWToARGBRow_SSSE3;
+      }
     }
-  }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
+    if (TestCpuFlag(kCpuHasSSSE3)) {
+      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+      ARGBToYRow = ARGBToYRow_Any_SSSE3;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
     }
-  }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
+    if (TestCpuFlag(kCpuHasAVX2)) {
+      ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+      ARGBToYRow = ARGBToYRow_Any_AVX2;
+      if (IS_ALIGNED(width, 32)) {
+        ARGBToUVRow = ARGBToUVRow_AVX2;
+        ARGBToYRow = ARGBToYRow_AVX2;
+      }
     }
-  }
 #endif
-  {
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (width * 4 + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
+#if defined(HAS_RAWTOARGBROW_DSPR2)
+    if (TestCpuFlag(kCpuHasDSPR2)) {
+      RAWToARGBRow = RAWToARGBRow_Any_DSPR2;
+      if (IS_ALIGNED(width, 4)) {
+        RAWToARGBRow = RAWToARGBRow_DSPR2;
+      }
+    }
+#endif
+    {
+      // Allocate 2 rows of ARGB.
+      const int kRowSize = (width * 4 + 31) & ~31;
+      align_buffer_64(row, kRowSize * 2);
 #endif
 
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
-      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
-      RAWToYRow(src_raw, dst_y, width);
-      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+  for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+    RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+    RAWToYRow(src_raw, dst_y, width);
+    RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
 #else
-      RAWToARGBRow(src_raw, row, width);
-      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+    RAWToARGBRow(src_raw, row, width);
+    RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-      src_raw += src_stride_raw * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
-      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
-      RAWToYRow(src_raw, dst_y, width);
-#else
-      RAWToARGBRow(src_raw, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_RAWTOYROW_NEON)
-    free_aligned_buffer_64(row);
+    src_raw += src_stride_raw * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
   }
+  if (height & 1) {
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+    RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+    RAWToYRow(src_raw, dst_y, width);
+#else
+    RAWToARGBRow(src_raw, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
 #endif
-  return 0;
+  }
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+  free_aligned_buffer_64(row);
+}
+#endif
+return 0;
 }
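
On targets without a direct RAWToYRow/RAWToUVRow kernel, the fallback above converts two source rows at a time into a temporary ARGB buffer whose per-row size is rounded up to the next multiple of 32 bytes, keeping both rows 32-byte aligned for the ARGBToY/ARGBToUV kernels. A minimal sketch of that rounding, with the helper name and example widths chosen purely for illustration:

// (width * 4 + 31) & ~31 rounds an ARGB row (4 bytes per pixel) up to the
// next multiple of 32 bytes; this is the kRowSize used above.
static int RoundedArgbRowSize(int width) {  /* hypothetical helper */
  return (width * 4 + 31) & ~31;
}
// RoundedArgbRowSize(10) == 64  (40 pixel bytes round up to 64)
// RoundedArgbRowSize(16) == 64  (64 bytes are already 32-byte aligned)
// RoundedArgbRowSize(17) == 96  (68 pixel bytes round up to 96)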
 
 // Convert RGB565 to I420.
 LIBYUV_API
-int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height) {
+int RGB565ToI420(const uint8* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8* dst_y,
+                 int dst_stride_y,
+                 uint8* dst_u,
+                 int dst_stride_u,
+                 uint8* dst_v,
+                 int dst_stride_v,
+                 int width,
+                 int height) {
   int y;
-#if defined(HAS_RGB565TOYROW_NEON)
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
   void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
-      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+                        uint8* dst_u, uint8* dst_v, int width) =
+      RGB565ToUVRow_C;
   void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
       RGB565ToYRow_C;
 #else
   void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
       RGB565ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1057,107 +1242,130 @@
       }
     }
   }
+#elif defined(HAS_RGB565TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+    RGB565ToYRow = RGB565ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToYRow = RGB565ToYRow_MSA;
+      RGB565ToUVRow = RGB565ToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from RGB565 to ARGB.
 #else
 #if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
-    }
-  }
+      if (TestCpuFlag(kCpuHasSSE2)) {
+        RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+        if (IS_ALIGNED(width, 8)) {
+          RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+        }
+      }
 #endif
 #if defined(HAS_RGB565TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
-    }
-  }
+      if (TestCpuFlag(kCpuHasAVX2)) {
+        RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+        if (IS_ALIGNED(width, 16)) {
+          RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+        }
+      }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
+      if (TestCpuFlag(kCpuHasSSSE3)) {
+        ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+        ARGBToYRow = ARGBToYRow_Any_SSSE3;
+        if (IS_ALIGNED(width, 16)) {
+          ARGBToUVRow = ARGBToUVRow_SSSE3;
+          ARGBToYRow = ARGBToYRow_SSSE3;
+        }
+      }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
+      if (TestCpuFlag(kCpuHasAVX2)) {
+        ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+        ARGBToYRow = ARGBToYRow_Any_AVX2;
+        if (IS_ALIGNED(width, 32)) {
+          ARGBToUVRow = ARGBToUVRow_AVX2;
+          ARGBToYRow = ARGBToYRow_AVX2;
+        }
+      }
 #endif
-  {
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (width * 4 + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
+#if defined(HAS_RGB565TOARGBROW_DSPR2)
+      if (TestCpuFlag(kCpuHasDSPR2)) {
+        RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
+        if (IS_ALIGNED(width, 8)) {
+          RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
+        }
+      }
+#endif
+      {
+        // Allocate 2 rows of ARGB.
+        const int kRowSize = (width * 4 + 31) & ~31;
+        align_buffer_64(row, kRowSize * 2);
 #endif
 
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
-      RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
-      RGB565ToYRow(src_rgb565, dst_y, width);
-      RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+  for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+    RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+    RGB565ToYRow(src_rgb565, dst_y, width);
+    RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
 #else
-      RGB565ToARGBRow(src_rgb565, row, width);
-      RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+    RGB565ToARGBRow(src_rgb565, row, width);
+    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-      src_rgb565 += src_stride_rgb565 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
-      RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
-      RGB565ToYRow(src_rgb565, dst_y, width);
-#else
-      RGB565ToARGBRow(src_rgb565, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_RGB565TOYROW_NEON)
-    free_aligned_buffer_64(row);
+    src_rgb565 += src_stride_rgb565 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
   }
+  if (height & 1) {
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+    RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+    RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+    RGB565ToARGBRow(src_rgb565, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
 #endif
-  return 0;
+  }
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+  free_aligned_buffer_64(row);
+}
+#endif
+return 0;
 }
 
 // Convert ARGB1555 to I420.
 LIBYUV_API
-int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
+int ARGB1555ToI420(const uint8* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8* dst_y,
+                   int dst_stride_y,
+                   uint8* dst_u,
+                   int dst_stride_u,
+                   uint8* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
-#if defined(HAS_ARGB1555TOYROW_NEON)
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
   void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+                          uint8* dst_u, uint8* dst_v, int width) =
+      ARGB1555ToUVRow_C;
   void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
       ARGB1555ToYRow_C;
 #else
   void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
       ARGB1555ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1179,109 +1387,124 @@
       }
     }
   }
+#elif defined(HAS_ARGB1555TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+    ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+      ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from ARGB1555 to ARGB.
 #else
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
-    }
-  }
+        if (TestCpuFlag(kCpuHasSSE2)) {
+          ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+          if (IS_ALIGNED(width, 8)) {
+            ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+          }
+        }
 #endif
 #if defined(HAS_ARGB1555TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
-    }
-  }
+        if (TestCpuFlag(kCpuHasAVX2)) {
+          ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+          if (IS_ALIGNED(width, 16)) {
+            ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+          }
+        }
 #endif
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
+        if (TestCpuFlag(kCpuHasSSSE3)) {
+          ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+          ARGBToYRow = ARGBToYRow_Any_SSSE3;
+          if (IS_ALIGNED(width, 16)) {
+            ARGBToUVRow = ARGBToUVRow_SSSE3;
+            ARGBToYRow = ARGBToYRow_SSSE3;
+          }
+        }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
+        if (TestCpuFlag(kCpuHasAVX2)) {
+          ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+          ARGBToYRow = ARGBToYRow_Any_AVX2;
+          if (IS_ALIGNED(width, 32)) {
+            ARGBToUVRow = ARGBToUVRow_AVX2;
+            ARGBToYRow = ARGBToYRow_AVX2;
+          }
+        }
 #endif
-  {
-    // Allocate 2 rows of ARGB.
-    const int kRowSize = (width * 4 + 31) & ~31;
-    align_buffer_64(row, kRowSize * 2);
+        {
+          // Allocate 2 rows of ARGB.
+          const int kRowSize = (width * 4 + 31) & ~31;
+          align_buffer_64(row, kRowSize * 2);
 #endif
 
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
-      ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
-      ARGB1555ToYRow(src_argb1555, dst_y, width);
-      ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
-                     width);
+  for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+    ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+    ARGB1555ToYRow(src_argb1555, dst_y, width);
+    ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                   width);
 #else
-      ARGB1555ToARGBRow(src_argb1555, row, width);
-      ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
-                        width);
-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+    ARGB1555ToARGBRow(src_argb1555, row, width);
+    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+                      width);
+    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
+    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-      src_argb1555 += src_stride_argb1555 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
-      ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
-      ARGB1555ToYRow(src_argb1555, dst_y, width);
-#else
-      ARGB1555ToARGBRow(src_argb1555, row, width);
-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_ARGB1555TOYROW_NEON)
-    free_aligned_buffer_64(row);
+    src_argb1555 += src_stride_argb1555 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
   }
+  if (height & 1) {
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+    ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+    ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+    ARGB1555ToARGBRow(src_argb1555, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+    ARGBToYRow(row, dst_y, width);
 #endif
-  return 0;
+  }
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+  free_aligned_buffer_64(row);
+}
+#endif
+return 0;
 }
 
 // Convert ARGB4444 to I420.
 LIBYUV_API
-int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
+int ARGB4444ToI420(const uint8* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8* dst_y,
+                   int dst_stride_y,
+                   uint8* dst_u,
+                   int dst_stride_u,
+                   uint8* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
 #if defined(HAS_ARGB4444TOYROW_NEON)
   void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+                          uint8* dst_u, uint8* dst_v, int width) =
+      ARGB4444ToUVRow_C;
   void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
       ARGB4444ToYRow_C;
 #else
   void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
       ARGB4444ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1321,6 +1544,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1341,18 +1572,30 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+      if (IS_ALIGNED(width, 32)) {
+        ARGBToUVRow = ARGBToUVRow_MSA;
+      }
+    }
+  }
+#endif
   {
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
 
-    for (y = 0; y < height - 1; y += 2) {
+  for (y = 0; y < height - 1; y += 2) {
 #if defined(HAS_ARGB4444TOYROW_NEON)
-      ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
-      ARGB4444ToYRow(src_argb4444, dst_y, width);
-      ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
-                     width);
+    ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+    ARGB4444ToYRow(src_argb4444, dst_y, width);
+    ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+                   width);
 #else
       ARGB4444ToARGBRow(src_argb4444, row, width);
       ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
@@ -1361,25 +1604,107 @@
       ARGBToYRow(row, dst_y, width);
       ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
 #endif
-      src_argb4444 += src_stride_argb4444 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
+    src_argb4444 += src_stride_argb4444 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
 #if defined(HAS_ARGB4444TOYROW_NEON)
-      ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
-      ARGB4444ToYRow(src_argb4444, dst_y, width);
+    ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+    ARGB4444ToYRow(src_argb4444, dst_y, width);
 #else
       ARGB4444ToARGBRow(src_argb4444, row, width);
       ARGBToUVRow(row, 0, dst_u, dst_v, width);
       ARGBToYRow(row, dst_y, width);
 #endif
-    }
-#if !defined(HAS_ARGB4444TOYROW_NEON)
-    free_aligned_buffer_64(row);
   }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+  free_aligned_buffer_64(row);
+}
 #endif
+return 0;
+}
+
+static void SplitPixels(const uint8* src_u,
+                        int src_pixel_stride_uv,
+                        uint8* dst_u,
+                        int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst_u = *src_u;
+    ++dst_u;
+    src_u += src_pixel_stride_uv;
+  }
+}
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8* src_y,
+                     int src_stride_y,
+                     const uint8* src_u,
+                     int src_stride_u,
+                     const uint8* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8* dst_y,
+                     int dst_stride_y,
+                     uint8* dst_u,
+                     int dst_stride_u,
+                     uint8* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height) {
+  int y;
+  const ptrdiff_t vu_off = src_v - src_u;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+
+  // Copy UV planes as is - I420
+  if (src_pixel_stride_uv == 1) {
+    CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+    CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+    return 0;
+    // Split UV planes - NV21
+  } else if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+             src_stride_u == src_stride_v) {
+    SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
+                 halfwidth, halfheight);
+    return 0;
+    // Split UV planes - NV12
+  } else if (src_pixel_stride_uv == 2 && vu_off == 1 &&
+             src_stride_u == src_stride_v) {
+    SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
+                 halfwidth, halfheight);
+    return 0;
+  }
+
+  for (y = 0; y < halfheight; ++y) {
+    SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
+    SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
   return 0;
 }
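
Android420ToI420 above folds the possible YUV_420_888 layouts into one entry point: a pixel stride of 1 is plain I420 and the chroma planes are copied, a pixel stride of 2 with U and V interleaved in one buffer is split as NV21 or NV12 depending on whether V precedes U, and anything else falls through to the per-pixel SplitPixels loop. A hypothetical caller feeding it the plane pointers, row strides and shared chroma pixel stride of an Android image might look like the sketch below; the wrapper name and the tightly packed destination strides are assumptions, not part of this patch.

// Illustrative wrapper around Android420ToI420; the inputs would come from
// the three planes of a YUV_420_888 android.media.Image.
int ImagePlanesToI420(const uint8* y, int y_row_stride,
                      const uint8* u, int u_row_stride,
                      const uint8* v, int v_row_stride,
                      int uv_pixel_stride,
                      uint8* dst_y, uint8* dst_u, uint8* dst_v,
                      int width, int height) {
  int halfwidth = (width + 1) >> 1;
  return Android420ToI420(y, y_row_stride, u, u_row_stride, v, v_row_stride,
                          uv_pixel_stride,
                          dst_y, width,      // tightly packed Y plane
                          dst_u, halfwidth,  // tightly packed U plane
                          dst_v, halfwidth,  // tightly packed V plane
                          width, height);
}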
 
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
index fb9582d..5007bdb 100644
--- a/files/source/convert_argb.cc
+++ b/files/source/convert_argb.cc
@@ -26,11 +26,13 @@
 
 // Copy ARGB with optional flipping
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height) {
-  if (!src_argb || !dst_argb ||
-      width <= 0 || height == 0) {
+int ARGBCopy(const uint8* src_argb,
+             int src_stride_argb,
+             uint8* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -40,27 +42,29 @@
     src_stride_argb = -src_stride_argb;
   }
 
-  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
-            width * 4, height);
+  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
+            height);
   return 0;
 }
 
 // Convert I420 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
+static int I420ToARGBMatrix(const uint8* src_y,
+                            int src_stride_y,
+                            const uint8* src_u,
+                            int src_stride_u,
+                            const uint8* src_v,
+                            int src_stride_v,
+                            uint8* dst_argb,
+                            int dst_stride_argb,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -102,6 +106,14 @@
     I422ToARGBRow = I422ToARGBRow_DSPR2;
   }
 #endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -117,111 +129,130 @@
 
 // Convert I420 to ARGB.
 LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
+int I420ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I420 to ABGR.
 LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int I420ToABGR(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert J420 to ARGB.
 LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
+int J420ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
 }
 
 // Convert J420 to ABGR.
 LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int J420ToABGR(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuJPEGConstants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert H420 to ARGB.
 LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvH709Constants,
-                          width, height);
+int H420ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
 }
 
 // Convert H420 to ABGR.
 LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int H420ToABGR(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuH709Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
+static int I422ToARGBMatrix(const uint8* src_y,
+                            int src_stride_y,
+                            const uint8* src_u,
+                            int src_stride_u,
+                            const uint8* src_v,
+                            int src_stride_v,
+                            uint8* dst_argb,
+                            int dst_stride_argb,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -231,10 +262,8 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
@@ -272,6 +301,14 @@
     I422ToARGBRow = I422ToARGBRow_DSPR2;
   }
 #endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -285,111 +322,130 @@
 
 // Convert I422 to ARGB.
 LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
+int I422ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I422 to ABGR.
 LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int I422ToABGR(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert J422 to ARGB.
 LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
+int J422ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
 }
 
 // Convert J422 to ABGR.
 LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int J422ToABGR(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuJPEGConstants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert H422 to ARGB.
 LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvH709Constants,
-                          width, height);
+int H422ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
 }
 
 // Convert H422 to ABGR.
 LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int H422ToABGR(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuH709Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
+static int I444ToARGBMatrix(const uint8* src_y,
+                            int src_stride_y,
+                            const uint8* src_u,
+                            int src_stride_u,
+                            const uint8* src_v,
+                            int src_stride_v,
+                            uint8* dst_argb,
+                            int dst_stride_argb,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I444ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I444ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I444ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -399,9 +455,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u == width &&
-      src_stride_v == width &&
+  if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -431,6 +485,22 @@
     }
   }
 #endif
+#if defined(HAS_I444TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    I444ToARGBRow = I444ToARGBRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I444ToARGBRow = I444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -444,138 +514,81 @@
 
 // Convert I444 to ARGB.
 LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
+int I444ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I444 to ABGR.
 LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int I444ToABGR(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert J444 to ARGB.
 LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
-}
-
-// Convert I411 to ARGB.
-LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*I411ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I411ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 4 == width &&
-      src_stride_v * 4 == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
-  }
-#if defined(HAS_I411TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I411TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I411ToARGBRow = I411ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I411ToARGBRow = I411ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I411TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I411ToARGBRow = I411ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
+int J444ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
 }
 
 // Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
-                                 const uint8* src_u, int src_stride_u,
-                                 const uint8* src_v, int src_stride_v,
-                                 const uint8* src_a, int src_stride_a,
-                                 uint8* dst_argb, int dst_stride_argb,
+static int I420AlphaToARGBMatrix(const uint8* src_y,
+                                 int src_stride_y,
+                                 const uint8* src_u,
+                                 int src_stride_u,
+                                 const uint8* src_v,
+                                 int src_stride_v,
+                                 const uint8* src_a,
+                                 int src_stride_a,
+                                 uint8* dst_argb,
+                                 int dst_stride_argb,
                                  const struct YuvConstants* yuvconstants,
-                                 int width, int height, int attenuate) {
+                                 int width,
+                                 int height,
+                                 int attenuate) {
   int y;
-  void (*I422AlphaToARGBRow)(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
+  void (*I422AlphaToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+                             const uint8* v_buf, const uint8* a_buf,
                              uint8* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) = I422AlphaToARGBRow_C;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
-                           int width) = ARGBAttenuateRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBAttenuateRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -617,6 +630,14 @@
     I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
   }
 #endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@@ -641,6 +662,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -661,49 +690,59 @@
 
 // Convert I420 with Alpha to ARGB.
 LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int attenuate) {
-  return I420AlphaToARGBMatrix(src_y, src_stride_y,
-                               src_u, src_stride_u,
-                               src_v, src_stride_v,
-                               src_a, src_stride_a,
-                               dst_argb, dst_stride_argb,
-                               &kYuvI601Constants,
-                               width, height, attenuate);
+int I420AlphaToARGB(const uint8* src_y,
+                    int src_stride_y,
+                    const uint8* src_u,
+                    int src_stride_u,
+                    const uint8* src_v,
+                    int src_stride_v,
+                    const uint8* src_a,
+                    int src_stride_a,
+                    uint8* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate) {
+  return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                               src_stride_v, src_a, src_stride_a, dst_argb,
+                               dst_stride_argb, &kYuvI601Constants, width,
+                               height, attenuate);
 }
 
 // Convert I420 with Alpha to ABGR.
 LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_abgr, int dst_stride_abgr,
-                    int width, int height, int attenuate) {
-  return I420AlphaToARGBMatrix(src_y, src_stride_y,
-                               src_v, src_stride_v,  // Swap U and V
-                               src_u, src_stride_u,
-                               src_a, src_stride_a,
-                               dst_abgr, dst_stride_abgr,
-                               &kYvuI601Constants,  // Use Yvu matrix
-                               width, height, attenuate);
+int I420AlphaToABGR(const uint8* src_y,
+                    int src_stride_y,
+                    const uint8* src_u,
+                    int src_stride_u,
+                    const uint8* src_v,
+                    int src_stride_v,
+                    const uint8* src_a,
+                    int src_stride_a,
+                    uint8* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate) {
+  return I420AlphaToARGBMatrix(
+      src_y, src_stride_y, src_v, src_stride_v,  // Swap U and V
+      src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+      &kYvuI601Constants,  // Use Yvu matrix
+      width, height, attenuate);
 }
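
When attenuate is non-zero, I420AlphaToARGBMatrix runs an ARGBAttenuateRow kernel over each converted row so the output carries premultiplied alpha. Conceptually that is a per-channel multiply by the pixel's alpha; the scalar sketch below is illustrative only (the helper name is made up, and the SIMD row kernels may round slightly differently).

// Premultiply one ARGB pixel (memory byte order B, G, R, A) by its alpha.
static void AttenuatePixel(uint8* argb) {
  const int a = argb[3];
  argb[0] = (uint8)(argb[0] * a / 255);  // B
  argb[1] = (uint8)(argb[1] * a / 255);  // G
  argb[2] = (uint8)(argb[2] * a / 255);  // R
}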
 
 // Convert I400 to ARGB.
 LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int I400ToARGB(const uint8* src_y,
+               int src_stride_y,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*I400ToARGBRow)(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) = I400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I400ToARGBRow)(const uint8* y_buf, uint8* rgb_buf, int width) =
+      I400ToARGBRow_C;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -713,8 +752,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
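
The "Coalesce rows" check that this hunk reformats appears in nearly every converter in this file: when the source and destination strides carry no padding, the rows are contiguous in memory, so the image can be treated as a single row of width * height pixels and the row kernel is invoked exactly once. A standalone sketch of the idea (names are illustrative, not libyuv symbols):

// Collapses a tightly packed plane pair into one long "row" so a row kernel
// runs once instead of `height` times.  bpp = bytes per pixel of each plane.
static void CoalesceRows(int* width, int* height,
                         int* src_stride, int src_bpp,
                         int* dst_stride, int dst_bpp) {
  if (*src_stride == *width * src_bpp && *dst_stride == *width * dst_bpp) {
    *width *= *height;  // One row covering the whole image.
    *height = 1;
    *src_stride = 0;    // Strides no longer matter; rows are never advanced.
    *dst_stride = 0;
  }
}
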
@@ -743,6 +781,14 @@
     }
   }
 #endif
+#if defined(HAS_I400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I400ToARGBRow = I400ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      I400ToARGBRow = I400ToARGBRow_MSA;
+    }
+  }
+#endif
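
Every HAS_*_MSA and HAS_*_DSPR2 block added in this CL follows the same selection pattern as the existing SSE2/SSSE3/NEON paths: if the CPU flag is set, pick the *_Any_* variant (which copes with arbitrary widths), then upgrade to the full-width kernel only when the width is a multiple of the kernel's batch size. A generic, hedged sketch of that selection (placeholder names, not libyuv symbols):

#include <stdint.h>

typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

// Placeholder kernels standing in for Row_C / Row_Any_SIMD / Row_SIMD.
static void Row_C(const uint8_t*, uint8_t*, int) {}
static void Row_Any_SIMD(const uint8_t*, uint8_t*, int) {}
static void Row_SIMD(const uint8_t*, uint8_t*, int) {}

// has_feature: result of a TestCpuFlag-style query.
// batch: pixels processed per iteration by the full-speed kernel.
static RowFn SelectRowFn(bool has_feature, int width, int batch) {
  RowFn fn = Row_C;            // Portable fallback.
  if (has_feature) {
    fn = Row_Any_SIMD;         // Any width; handles the ragged tail itself.
    if (width % batch == 0) {  // Same test as IS_ALIGNED(width, batch)
      fn = Row_SIMD;           // when batch is a power of two.
    }
  }
  return fn;
}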
 
   for (y = 0; y < height; ++y) {
     I400ToARGBRow(src_y, dst_argb, width);
@@ -754,14 +800,16 @@
 
 // Convert J400 to ARGB.
 LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int J400ToARGB(const uint8* src_y,
+               int src_stride_y,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
   void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
       J400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -771,8 +819,7 @@
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -801,6 +848,14 @@
     }
   }
 #endif
+#if defined(HAS_J400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    J400ToARGBRow = J400ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      J400ToARGBRow = J400ToARGBRow_MSA;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     J400ToARGBRow(src_y, dst_argb, width);
     src_y += src_stride_y;
@@ -810,85 +865,89 @@
 }
 
 // Shuffle table for converting BGRA to ARGB.
-static uvec8 kShuffleMaskBGRAToARGB = {
-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
+static uvec8 kShuffleMaskBGRAToARGB = {3u,  2u,  1u, 0u, 7u,  6u,  5u,  4u,
+                                       11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
 
 // Shuffle table for converting ABGR to ARGB.
-static uvec8 kShuffleMaskABGRToARGB = {
-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
+static uvec8 kShuffleMaskABGRToARGB = {2u,  1u, 0u, 3u,  6u,  5u,  4u,  7u,
+                                       10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
 
 // Shuffle table for converting RGBA to ARGB.
-static uvec8 kShuffleMaskRGBAToARGB = {
-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
+static uvec8 kShuffleMaskRGBAToARGB = {1u, 2u,  3u,  0u, 5u,  6u,  7u,  4u,
+                                       9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
 
 // Convert BGRA to ARGB.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
+int BGRAToARGB(const uint8* src_bgra,
+               int src_stride_bgra,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB), width, height);
 }
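
The three shuffle tables above fully describe these order-only conversions: entry i of a table is the index of the source byte copied to output byte i within each 16-byte (4-pixel) group, so {3, 2, 1, 0, ...} simply reverses the byte order of every pixel. ARGBShuffle applies the table with SIMD shuffle instructions where available; a scalar reference of the same operation, for illustration only (not the libyuv implementation):

#include <stdint.h>

// Scalar reference for a 16-byte shuffle applied across a row of 4-byte
// pixels.  width is in pixels and is assumed to be a multiple of 4 here.
static void ShuffleRow_Reference(const uint8_t* src, uint8_t* dst,
                                 const uint8_t shuffle[16], int width) {
  for (int x = 0; x < width; x += 4) {
    for (int i = 0; i < 16; ++i) {
      dst[i] = src[shuffle[i]];
    }
    src += 16;
    dst += 16;
  }
}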
 
 // Convert ARGB to BGRA (same as BGRAToARGB).
 LIBYUV_API
-int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
+int ARGBToBGRA(const uint8* src_bgra,
+               int src_stride_bgra,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB), width, height);
 }
 
 // Convert ABGR to ARGB.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
+int ABGRToARGB(const uint8* src_abgr,
+               int src_stride_abgr,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB), width, height);
 }
 
 // Convert ARGB to ABGR (same as ABGRToARGB).
 LIBYUV_API
-int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
+int ARGBToABGR(const uint8* src_abgr,
+               int src_stride_abgr,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB), width, height);
 }
 
 // Convert RGBA to ARGB.
 LIBYUV_API
-int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_rgba, src_stride_rgba,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskRGBAToARGB),
-                     width, height);
+int RGBAToARGB(const uint8* src_rgba,
+               int src_stride_rgba,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskRGBAToARGB), width, height);
 }
 
 // Convert RGB24 to ARGB.
 LIBYUV_API
-int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
+int RGB24ToARGB(const uint8* src_rgb24,
+                int src_stride_rgb24,
+                uint8* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height) {
   int y;
   void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  if (!src_rgb24 || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -898,8 +957,7 @@
     src_stride_rgb24 = -src_stride_rgb24;
   }
   // Coalesce rows.
-  if (src_stride_rgb24 == width * 3 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb24 = dst_stride_argb = 0;
@@ -920,6 +978,22 @@
     }
   }
 #endif
+#if defined(HAS_RGB24TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RGB24ToARGBRow(src_rgb24, dst_argb, width);
@@ -931,14 +1005,16 @@
 
 // Convert RAW to ARGB.
 LIBYUV_API
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
+int RAWToARGB(const uint8* src_raw,
+              int src_stride_raw,
+              uint8* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
   int y;
   void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
       RAWToARGBRow_C;
-  if (!src_raw || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -948,8 +1024,7 @@
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_argb = 0;
@@ -970,6 +1045,22 @@
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    RAWToARGBRow = RAWToARGBRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToARGBRow = RAWToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RAWToARGBRow(src_raw, dst_argb, width);
@@ -981,14 +1072,16 @@
 
 // Convert RGB565 to ARGB.
 LIBYUV_API
-int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int RGB565ToARGB(const uint8* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
   void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
       RGB565ToARGBRow_C;
-  if (!src_rgb565 || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -998,8 +1091,7 @@
     src_stride_rgb565 = -src_stride_rgb565;
   }
   // Coalesce rows.
-  if (src_stride_rgb565 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb565 = dst_stride_argb = 0;
@@ -1028,6 +1120,22 @@
     }
   }
 #endif
+#if defined(HAS_RGB565TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RGB565ToARGBRow(src_rgb565, dst_argb, width);
@@ -1039,14 +1147,16 @@
 
 // Convert ARGB1555 to ARGB.
 LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
+int ARGB1555ToARGB(const uint8* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   int y;
   void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
-      int width) = ARGB1555ToARGBRow_C;
-  if (!src_argb1555 || !dst_argb ||
-      width <= 0 || height == 0) {
+                            int width) = ARGB1555ToARGBRow_C;
+  if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1056,8 +1166,7 @@
     src_stride_argb1555 = -src_stride_argb1555;
   }
   // Coalesce rows.
-  if (src_stride_argb1555 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb1555 = dst_stride_argb = 0;
@@ -1086,6 +1195,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGB1555TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
@@ -1097,14 +1222,16 @@
 
 // Convert ARGB4444 to ARGB.
 LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
+int ARGB4444ToARGB(const uint8* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   int y;
   void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
-      int width) = ARGB4444ToARGBRow_C;
-  if (!src_argb4444 || !dst_argb ||
-      width <= 0 || height == 0) {
+                            int width) = ARGB4444ToARGBRow_C;
+  if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1114,8 +1241,7 @@
     src_stride_argb4444 = -src_stride_argb4444;
   }
   // Coalesce rows.
-  if (src_stride_argb4444 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb4444 = dst_stride_argb = 0;
@@ -1144,6 +1270,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGB4444TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
@@ -1155,18 +1297,19 @@
 
 // Convert NV12 to ARGB.
 LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int NV12ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_uv,
+               int src_stride_uv,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      NV12ToARGBRow_C;
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1199,6 +1342,22 @@
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
@@ -1213,18 +1372,19 @@
 
 // Convert NV21 to ARGB.
 LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int NV21ToARGB(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_uv,
+               int src_stride_uv,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV21ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*NV21ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      NV21ToARGBRow_C;
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1257,6 +1417,14 @@
     }
   }
 #endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
@@ -1271,17 +1439,17 @@
 
 // Convert M420 to ARGB.
 LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int M420ToARGB(const uint8* src_m420,
+               int src_stride_m420,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_m420 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      NV12ToARGBRow_C;
+  if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1314,6 +1482,22 @@
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
@@ -1332,17 +1516,17 @@
 
 // Convert YUY2 to ARGB.
 LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int YUY2ToARGB(const uint8* src_yuy2,
+               int src_stride_yuy2,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToARGBRow)(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) =
+  void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants, int width) =
       YUY2ToARGBRow_C;
-  if (!src_yuy2 || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1352,8 +1536,7 @@
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_argb = 0;
@@ -1382,6 +1565,14 @@
     }
   }
 #endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_MSA;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
     src_yuy2 += src_stride_yuy2;
@@ -1392,17 +1583,17 @@
 
 // Convert UYVY to ARGB.
 LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int UYVYToARGB(const uint8* src_uyvy,
+               int src_stride_uyvy,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToARGBRow)(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) =
+  void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants, int width) =
       UYVYToARGBRow_C;
-  if (!src_uyvy || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1412,8 +1603,7 @@
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_argb = 0;
@@ -1442,6 +1632,14 @@
     }
   }
 #endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToARGBRow = UYVYToARGBRow_MSA;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
     src_uyvy += src_stride_uyvy;
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
index 46abdeb..e6ff524 100644
--- a/files/source/convert_from.cc
+++ b/files/source/convert_from.cc
@@ -30,107 +30,100 @@
 }
 
 // I420 To any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int dst_uv_width, int dst_uv_height) {
+static int I420ToI4xx(const uint8* src_y,
+                      int src_stride_y,
+                      const uint8* src_u,
+                      int src_stride_u,
+                      const uint8* src_v,
+                      int src_stride_v,
+                      uint8* dst_y,
+                      int dst_stride_y,
+                      uint8* dst_u,
+                      int dst_stride_u,
+                      uint8* dst_v,
+                      int dst_stride_v,
+                      int src_y_width,
+                      int src_y_height,
+                      int dst_uv_width,
+                      int dst_uv_height) {
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
   const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
-  if (src_y_width == 0 || src_y_height == 0 ||
-      dst_uv_width <= 0 || dst_uv_height <= 0) {
+  if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
+      dst_uv_height <= 0) {
     return -1;
   }
-  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-             dst_y, dst_stride_y, dst_y_width, dst_y_height,
-             kFilterBilinear);
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
+  if (dst_y) {
+    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+               dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+  }
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+             dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+             dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
   return 0;
 }
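
The only behavioural change in this hunk is the new if (dst_y) guard: the Y plane is scaled only when a destination is supplied, so a caller that needs just resampled chroma can pass NULL for dst_y. A hedged usage sketch of what that appears to permit (buffer management omitted):

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert_from.h"

// Resample only the chroma planes of an I420 frame to 4:2:2 layout.
// Passing NULL for dst_y relies on the new guard in I420ToI4xx.
int ChromaOnlyTo422(const uint8_t* src_y, int src_stride_y,
                    const uint8_t* src_u, int src_stride_u,
                    const uint8_t* src_v, int src_stride_v,
                    uint8_t* dst_u, int dst_stride_u,
                    uint8_t* dst_v, int dst_stride_v,
                    int width, int height) {
  return libyuv::I420ToI422(src_y, src_stride_y, src_u, src_stride_u,
                            src_v, src_stride_v,
                            NULL, 0,  // No Y output requested.
                            dst_u, dst_stride_u, dst_v, dst_stride_v,
                            width, height);
}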
 
 // 420 chroma is 1/2 width, 1/2 height
 // 422 chroma is 1/2 width, 1x height
 LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420ToI422(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int dst_uv_width = (Abs(width) + 1) >> 1;
   const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
+  return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, dst_uv_width,
+                    dst_uv_height);
 }
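
For 4:2:2 output the chroma width is (Abs(width) + 1) >> 1, i.e. half the luma width rounded up, while the chroma height stays at the full luma height; 4:4:4 keeps both dimensions. For example, a 17x11 frame yields 9x11 chroma for I422 and 17x11 for I444. A tiny check of the rounding, as a sketch:

#include <assert.h>

// Rounded-up halving, matching (Abs(width) + 1) >> 1 above.
static int HalfRoundUp(int v) {
  return (v + 1) >> 1;
}

int main() {
  assert(HalfRoundUp(16) == 8);  // Even widths halve exactly.
  assert(HalfRoundUp(17) == 9);  // Odd widths keep the extra chroma column.
  return 0;
}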
 
 // 420 chroma is 1/2 width, 1/2 height
 // 444 chroma is 1x width, 1x height
 LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420ToI444(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int dst_uv_width = Abs(width);
   const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 411 chroma is 1/4 width, 1x height
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int dst_uv_width = (Abs(width) + 3) >> 2;
-  const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
+  return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, dst_uv_width,
+                    dst_uv_height);
 }
 
 // Copy to I400. Source can be I420,422,444,400,NV12,NV21
 LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
-             uint8* dst_y, int dst_stride_y,
-             int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
+int I400Copy(const uint8* src_y,
+             int src_stride_y,
+             uint8* dst_y,
+             int dst_stride_y,
+             int width,
+             int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -144,17 +137,21 @@
 }
 
 LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int I422ToYUY2(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
   void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
                         const uint8* src_v, uint8* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -164,10 +161,8 @@
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_yuy2 == width * 2) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
@@ -200,17 +195,21 @@
 }
 
 LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int I420ToYUY2(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
   void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
                         const uint8* src_v, uint8* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -235,6 +234,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -252,17 +259,21 @@
 }
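
I422ToYUY2 and I422ToUYVY pack planar 4:2:2 into the two byte orders of packed 4:2:2: YUY2 stores Y0 U0 Y1 V0 per pair of pixels and UYVY stores U0 Y0 V0 Y1, which is why the coalesce checks expect 2 bytes per pixel. A scalar reference of the YUY2 packing (illustration only, not the libyuv row kernel):

#include <stdint.h>

// Packs one row of planar 4:2:2 (separate Y, U, V) into YUY2 byte order:
// Y0 U0 Y1 V0 for every two pixels.  width is in pixels, assumed even here.
static void PackYUY2Row_Reference(const uint8_t* src_y, const uint8_t* src_u,
                                  const uint8_t* src_v, uint8_t* dst_yuy2,
                                  int width) {
  for (int x = 0; x < width; x += 2) {
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_yuy2 += 4;
  }
}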
 
 LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int I422ToUYVY(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
   void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
                         const uint8* src_v, uint8* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -272,10 +283,8 @@
     dst_stride_uyvy = -dst_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_uyvy == width * 2) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
@@ -296,6 +305,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -308,17 +325,21 @@
 }
 
 LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int I420ToUYVY(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
   void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
                         const uint8* src_v, uint8* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -343,6 +364,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -359,113 +388,70 @@
   return 0;
 }
 
+// TODO(fbarchard): test negative height for invert.
 LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
-  int y;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-      int width) = MergeUVRow_C;
-  // Coalesce rows.
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+int I420ToNV12(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+      height == 0) {
     return -1;
   }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_uv = -dst_stride_uv;
+  int halfwidth = (width + 1) / 2;
+  int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
-    width *= height;
-    height = 1;
-    src_stride_y = dst_stride_y = 0;
-  }
-  // Coalesce rows.
-  if (src_stride_u == halfwidth &&
-      src_stride_v == halfwidth &&
-      dst_stride_uv == halfwidth * 2) {
-    halfwidth *= halfheight;
-    halfheight = 1;
-    src_stride_u = src_stride_v = dst_stride_uv = 0;
-  }
-#if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MergeUVRow_ = MergeUVRow_Any_SSE2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow_ = MergeUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MergeUVRow_ = MergeUVRow_Any_NEON;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow_ = MergeUVRow_NEON;
-    }
-  }
-#endif
-
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
-  for (y = 0; y < halfheight; ++y) {
-    // Merge a row of U and V into a row of UV.
-    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-    dst_uv += dst_stride_uv;
-  }
+  MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
+               halfwidth, halfheight);
   return 0;
 }
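
The rewritten I420ToNV12 drops the hand-rolled MergeUVRow dispatch and delegates to CopyPlane plus MergeUVPlane, which carry their own SIMD selection and negative-height handling; dst_y is now optional, and halfheight is computed as (height +/- 1) / 2 so the rounded half keeps the sign of a negative (inverted) height. For reference, the interleave MergeUVPlane performs looks like this in scalar form (illustration, not the libyuv implementation):

#include <stdint.h>

// Interleaves separate U and V planes into NV12's packed UV plane
// (byte order U then V for each chroma sample).
static void MergeUVPlane_Reference(const uint8_t* src_u, int src_stride_u,
                                   const uint8_t* src_v, int src_stride_v,
                                   uint8_t* dst_uv, int dst_stride_uv,
                                   int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      dst_uv[2 * x + 0] = src_u[x];
      dst_uv[2 * x + 1] = src_v[x];
    }
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_uv += dst_stride_uv;
  }
}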
 
 LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height) {
-  return I420ToNV12(src_y, src_stride_y,
-                    src_v, src_stride_v,
-                    src_u, src_stride_u,
-                    dst_y, dst_stride_y,
-                    dst_vu, dst_stride_vu,
+int I420ToNV21(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                    src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
                     width, height);
 }
 
 // Convert I420 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_rgba, int dst_stride_rgba,
+static int I420ToRGBAMatrix(const uint8* src_y,
+                            int src_stride_y,
+                            const uint8* src_u,
+                            int src_stride_u,
+                            const uint8* src_v,
+                            int src_stride_v,
+                            uint8* dst_rgba,
+                            int dst_stride_rgba,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgba ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -507,6 +493,14 @@
     I422ToRGBARow = I422ToRGBARow_DSPR2;
   }
 #endif
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
@@ -522,50 +516,58 @@
 
 // Convert I420 to RGBA.
 LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return I420ToRGBAMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_rgba, dst_stride_rgba,
-                          &kYuvI601Constants,
-                          width, height);
+int I420ToRGBA(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I420 to BGRA.
 LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  return I420ToRGBAMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_bgra, dst_stride_bgra,
+int I420ToBGRA(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_bgra, dst_stride_bgra,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
-                             const uint8* src_u, int src_stride_u,
-                             const uint8* src_v, int src_stride_v,
-                             uint8* dst_rgb24, int dst_stride_rgb24,
+static int I420ToRGB24Matrix(const uint8* src_y,
+                             int src_stride_y,
+                             const uint8* src_u,
+                             int src_stride_u,
+                             const uint8* src_v,
+                             int src_stride_v,
+                             uint8* dst_rgb24,
+                             int dst_stride_rgb24,
                              const struct YuvConstants* yuvconstants,
-                             int width, int height) {
+                             int width,
+                             int height) {
   int y;
-  void (*I422ToRGB24Row)(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         const struct YuvConstants* yuvconstants,
-                         int width) = I422ToRGB24Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGB24Row)(const uint8* y_buf, const uint8* u_buf,
+                         const uint8* v_buf, uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB24Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -598,6 +600,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
@@ -613,50 +623,59 @@
 
 // Convert I420 to RGB24.
 LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
-  return I420ToRGB24Matrix(src_y, src_stride_y,
-                           src_u, src_stride_u,
-                           src_v, src_stride_v,
-                           dst_rgb24, dst_stride_rgb24,
-                           &kYuvI601Constants,
-                           width, height);
+int I420ToRGB24(const uint8* src_y,
+                int src_stride_y,
+                const uint8* src_u,
+                int src_stride_u,
+                const uint8* src_v,
+                int src_stride_v,
+                uint8* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24,
+                           &kYuvI601Constants, width, height);
 }
 
 // Convert I420 to RAW.
 LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
-  return I420ToRGB24Matrix(src_y, src_stride_y,
-                           src_v, src_stride_v,  // Swap U and V
-                           src_u, src_stride_u,
-                           dst_raw, dst_stride_raw,
+int I420ToRAW(const uint8* src_y,
+              int src_stride_y,
+              const uint8* src_u,
+              int src_stride_u,
+              const uint8* src_v,
+              int src_stride_v,
+              uint8* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+                           src_stride_v,  // Swap U and V
+                           src_u, src_stride_u, dst_raw, dst_stride_raw,
                            &kYvuI601Constants,  // Use Yvu matrix
                            width, height);
 }
 
 // Convert I420 to ARGB1555.
 LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
+int I420ToARGB1555(const uint8* src_y,
+                   int src_stride_y,
+                   const uint8* src_u,
+                   int src_stride_u,
+                   const uint8* src_v,
+                   int src_stride_v,
+                   uint8* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height) {
   int y;
-  void (*I422ToARGB1555Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+  void (*I422ToARGB1555Row)(const uint8* y_buf, const uint8* u_buf,
+                            const uint8* v_buf, uint8* rgb_buf,
                             const struct YuvConstants* yuvconstants,
                             int width) = I422ToARGB1555Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -689,6 +708,22 @@
     }
   }
 #endif
+#if defined(HAS_I422TOARGB1555ROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_DSPR2;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
@@ -703,23 +738,25 @@
   return 0;
 }
 
-
 // Convert I420 to ARGB4444.
 LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
+int I420ToARGB4444(const uint8* src_y,
+                   int src_stride_y,
+                   const uint8* src_u,
+                   int src_stride_u,
+                   const uint8* src_v,
+                   int src_stride_v,
+                   uint8* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height) {
   int y;
-  void (*I422ToARGB4444Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+  void (*I422ToARGB4444Row)(const uint8* y_buf, const uint8* u_buf,
+                            const uint8* v_buf, uint8* rgb_buf,
                             const struct YuvConstants* yuvconstants,
                             int width) = I422ToARGB4444Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -752,6 +789,22 @@
     }
   }
 #endif
+#if defined(HAS_I422TOARGB4444ROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_DSPR2;
+    if (IS_ALIGNED(width, 4)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
@@ -768,20 +821,22 @@
 
 // Convert I420 to RGB565.
 LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int I420ToRGB565(const uint8* src_y,
+                 int src_stride_y,
+                 const uint8* src_u,
+                 int src_stride_u,
+                 const uint8* src_v,
+                 int src_stride_v,
+                 uint8* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
-  void (*I422ToRGB565Row)(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) = I422ToRGB565Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf,
+                          const uint8* v_buf, uint8* rgb_buf,
+                          const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -814,6 +869,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
@@ -829,30 +892,31 @@
 
 // Ordered 4x4 dither for 888 to 565.  Values from 0 to 7.
 static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
 };
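
The table is four rows of four dither values (0..7). Inside the loop further down, *(uint32*)(dither4x4 + ((y & 3) << 2)) loads row y % 4 of the table as one packed uint32, so each output scanline reuses a different dither row. One plausible scalar formulation of applying such a value before truncating 8-8-8 to 5-6-5 (a hedged sketch, not the libyuv ARGBToRGB565DitherRow kernel):

#include <stdint.h>

static int Clamp255(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}

// Adds the 0..7 dither value to each channel, clamps, then truncates to the
// 5/6/5 bit depths of RGB565.  src is one pixel in libyuv's little-endian
// ARGB byte order (B, G, R, A in memory).
static uint16_t DitherPixelTo565(const uint8_t src[4], int dither) {
  int b = Clamp255(src[0] + dither) >> 3;
  int g = Clamp255(src[1] + dither) >> 2;
  int r = Clamp255(src[2] + dither) >> 3;
  return (uint16_t)(b | (g << 5) | (r << 11));
}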
 
 // Convert I420 to RGB565 with dithering.
 LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
+int I420ToRGB565Dither(const uint8* src_y,
+                       int src_stride_y,
+                       const uint8* src_u,
+                       int src_stride_u,
+                       const uint8* src_v,
+                       int src_stride_v,
+                       uint8* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8* dither4x4,
+                       int width,
+                       int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
+  void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
   void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
+                                const uint32 dither4, int width) =
+      ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -896,6 +960,14 @@
     I422ToARGBRow = I422ToARGBRow_DSPR2;
   }
 #endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
@@ -920,13 +992,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+    }
+  }
+#endif
   {
     // Allocate a row of argb.
     align_buffer_64(row_argb, width * 4);
     for (y = 0; y < height; ++y) {
       I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
       ARGBToRGB565DitherRow(row_argb, dst_rgb565,
-                            *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+                            *(uint32*)(dither4x4 + ((y & 3) << 2)),
+                            width);  // NOLINT
       dst_rgb565 += dst_stride_rgb565;
       src_y += src_stride_y;
       if (y & 1) {
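The dither call above packs one 4-byte row of the 4x4 table, selected by y & 3, into a uint32. A standalone sketch of that selection, using memcpy instead of the pointer cast purely to keep the illustration free of aliasing concerns (same bytes, native byte order):

#include <cstdint>
#include <cstring>

// Returns the 4 dither bytes for output row y packed into a uint32,
// equivalent to *(uint32*)(dither4x4 + ((y & 3) << 2)).
static uint32_t DitherWordForRow(const uint8_t dither4x4[16], int y) {
  uint32_t d;
  std::memcpy(&d, dither4x4 + ((y & 3) << 2), sizeof(d));
  return d;
}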
@@ -941,218 +1022,156 @@
 
 // Convert I420 to specified format
 LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
-                    const uint8* u, int u_stride,
-                    const uint8* v, int v_stride,
-                    uint8* dst_sample, int dst_sample_stride,
-                    int width, int height,
+int ConvertFromI420(const uint8* y,
+                    int y_stride,
+                    const uint8* u,
+                    int u_stride,
+                    const uint8* v,
+                    int v_stride,
+                    uint8* dst_sample,
+                    int dst_sample_stride,
+                    int width,
+                    int height,
                     uint32 fourcc) {
   uint32 format = CanonicalFourCC(fourcc);
   int r = 0;
-  if (!y || !u|| !v || !dst_sample ||
-      width <= 0 || height == 0) {
+  if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
     return -1;
   }
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
-      r = I420ToYUY2(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
+      r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2, width,
+                     height);
       break;
     case FOURCC_UYVY:
-      r = I420ToUYVY(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
+      r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2, width,
+                     height);
       break;
     case FOURCC_RGBP:
-      r = I420ToRGB565(y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       dst_sample,
-                       dst_sample_stride ? dst_sample_stride : width * 2,
-                       width, height);
+      r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2, width,
+                       height);
       break;
     case FOURCC_RGBO:
-      r = I420ToARGB1555(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
+      r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                          dst_sample_stride ? dst_sample_stride : width * 2,
                          width, height);
       break;
     case FOURCC_R444:
-      r = I420ToARGB4444(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
+      r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                          dst_sample_stride ? dst_sample_stride : width * 2,
                          width, height);
       break;
     case FOURCC_24BG:
-      r = I420ToRGB24(y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_sample,
-                      dst_sample_stride ? dst_sample_stride : width * 3,
-                      width, height);
+      r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                      dst_sample_stride ? dst_sample_stride : width * 3, width,
+                      height);
       break;
     case FOURCC_RAW:
-      r = I420ToRAW(y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    dst_sample,
-                    dst_sample_stride ? dst_sample_stride : width * 3,
-                    width, height);
+      r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                    dst_sample_stride ? dst_sample_stride : width * 3, width,
+                    height);
       break;
     case FOURCC_ARGB:
-      r = I420ToARGB(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_BGRA:
-      r = I420ToBGRA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_ABGR:
-      r = I420ToABGR(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_RGBA:
-      r = I420ToRGBA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_I400:
-      r = I400Copy(y, y_stride,
-                   dst_sample,
-                   dst_sample_stride ? dst_sample_stride : width,
-                   width, height);
+      r = I400Copy(y, y_stride, dst_sample,
+                   dst_sample_stride ? dst_sample_stride : width, width,
+                   height);
       break;
     case FOURCC_NV12: {
       uint8* dst_uv = dst_sample + width * height;
-      r = I420ToNV12(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_uv,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
+      r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width, dst_uv,
+                     dst_sample_stride ? dst_sample_stride : width, width,
+                     height);
       break;
     }
     case FOURCC_NV21: {
       uint8* dst_vu = dst_sample + width * height;
-      r = I420ToNV21(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_vu,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
+      r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width, dst_vu,
+                     dst_sample_stride ? dst_sample_stride : width, width,
+                     height);
       break;
     }
     // TODO(fbarchard): Add M420.
     // Triplanar formats
-    // TODO(fbarchard): halfstride instead of halfwidth
     case FOURCC_I420:
     case FOURCC_YV12: {
-      int halfwidth = (width + 1) / 2;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      int halfstride = (dst_sample_stride + 1) / 2;
       int halfheight = (height + 1) / 2;
       uint8* dst_u;
       uint8* dst_v;
       if (format == FOURCC_YV12) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * halfheight;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * halfheight;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * halfheight;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * halfheight;
       }
-      r = I420Copy(y, y_stride,
-                   u, u_stride,
-                   v, v_stride,
-                   dst_sample, width,
-                   dst_u, halfwidth,
-                   dst_v, halfwidth,
+      r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                   dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
                    width, height);
       break;
     }
     case FOURCC_I422:
     case FOURCC_YV16: {
-      int halfwidth = (width + 1) / 2;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      int halfstride = (dst_sample_stride + 1) / 2;
       uint8* dst_u;
       uint8* dst_v;
       if (format == FOURCC_YV16) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * height;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * height;
       }
-      r = I420ToI422(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, halfwidth,
-                     dst_v, halfwidth,
+      r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
                      width, height);
       break;
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
       uint8* dst_u;
       uint8* dst_v;
       if (format == FOURCC_YV24) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + width * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + dst_sample_stride * height;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + width * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + dst_sample_stride * height;
       }
-      r = I420ToI444(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, width,
-                     dst_v, width,
-                     width, height);
+      r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride, dst_u, dst_sample_stride, dst_v,
+                     dst_sample_stride, width, height);
       break;
     }
-    case FOURCC_I411: {
-      int quarterwidth = (width + 3) / 4;
-      uint8* dst_u = dst_sample + width * height;
-      uint8* dst_v = dst_u + quarterwidth * height;
-      r = I420ToI411(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, quarterwidth,
-                     dst_v, quarterwidth,
-                     width, height);
-      break;
-    }
-
     // Formats not supported - MJPG, biplanar, some rgb formats.
     default:
       return -1;  // unknown fourcc - return failure code.
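The I420/YV12 (and I422/I444) branches above now honour dst_sample_stride instead of assuming a stride of width, so a caller that supplies a padded stride must size and slice the destination buffer the same way. A sketch of the layout the new code implies (hypothetical helper, not part of libyuv; for YV12 the U and V pointers are simply swapped):

#include <cstddef>
#include <cstdint>

struct I420Planes {
  uint8_t* y;
  uint8_t* u;
  uint8_t* v;
  int y_stride;
  int uv_stride;
};

// Mirrors the FOURCC_I420 layout above: a Y plane of stride * height bytes,
// followed by U and V planes of halfstride * halfheight bytes each.
static I420Planes SliceI420(uint8_t* dst, int stride, int width, int height) {
  if (stride == 0) stride = width;  // 0 means "tightly packed", as in the code
  const int halfstride = (stride + 1) / 2;
  const int halfheight = (height + 1) / 2;
  I420Planes p;
  p.y = dst;
  p.u = dst + static_cast<size_t>(stride) * height;
  p.v = p.u + static_cast<size_t>(halfstride) * halfheight;
  p.y_stride = stride;
  p.uv_stride = halfstride;
  return p;
}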
diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc
index 2a8682b..88f3827 100644
--- a/files/source/convert_from_argb.cc
+++ b/files/source/convert_from_argb.cc
@@ -22,16 +22,21 @@
 
 // ARGB little endian (bgra in memory) to I444
 LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI444(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
   void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int width) = ARGBToUV444Row_C;
+                         int width) = ARGBToUV444Row_C;
   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
@@ -41,20 +46,18 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u == width &&
-      dst_stride_v == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u == width && dst_stride_v == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_ARGBTOUV444ROW_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3)) {
-      ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
-      }
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+    }
   }
 #endif
 #if defined(HAS_ARGBTOUV444ROW_NEON)
@@ -65,6 +68,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -89,6 +100,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToYRow = ARGBToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -103,19 +130,22 @@
 
 // ARGB little endian (bgra in memory) to I422
 LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI422(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
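A hedged calling sketch for ARGBToI422 as declared above (the header name libyuv/convert_from_argb.h is an assumption; note the half-width, full-height chroma planes of I422):

#include <cstdint>
#include <vector>
#include "libyuv/convert_from_argb.h"  // assumed location of ARGBToI422

bool ToI422(const uint8_t* argb, int argb_stride, int width, int height,
            std::vector<uint8_t>* y, std::vector<uint8_t>* u,
            std::vector<uint8_t>* v) {
  const int halfwidth = (width + 1) / 2;  // chroma is subsampled horizontally only
  y->resize(static_cast<size_t>(width) * height);
  u->resize(static_cast<size_t>(halfwidth) * height);
  v->resize(static_cast<size_t>(halfwidth) * height);
  return libyuv::ARGBToI422(argb, argb_stride, y->data(), width, u->data(),
                            halfwidth, v->data(), halfwidth, width,
                            height) == 0;
}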
@@ -125,10 +155,8 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -169,6 +197,39 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToYRow = ARGBToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_DSPR2;
+    }
+  }
+#endif
+
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
@@ -181,95 +242,24 @@
   return 0;
 }
 
-// ARGB little endian (bgra in memory) to I411
 LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int width) = ARGBToUV411Row_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 4 == width &&
-      dst_stride_v * 4 == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV411ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUV411Row = ARGBToUV411Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
-
-LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int ARGBToNV12(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
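For the NV12 path above, the destination is a full-size Y plane followed by one interleaved UV plane of half height. A hedged sketch of a call under that layout (header name assumed; buffer sizing shown explicitly):

#include <cstdint>
#include <vector>
#include "libyuv/convert_from_argb.h"  // assumed location of ARGBToNV12

bool ToNV12(const uint8_t* argb, int argb_stride, int width, int height,
            std::vector<uint8_t>* nv12) {
  const int halfheight = (height + 1) / 2;
  const int uv_stride = ((width + 1) / 2) * 2;  // one U,V byte pair per 2 pixels
  nv12->resize(static_cast<size_t>(width) * height +        // Y plane
               static_cast<size_t>(uv_stride) * halfheight);  // interleaved UV
  uint8_t* dst_y = nv12->data();
  uint8_t* dst_uv = dst_y + static_cast<size_t>(width) * height;
  return libyuv::ARGBToNV12(argb, argb_stride, dst_y, width, dst_uv, uv_stride,
                            width, height) == 0;
}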
@@ -314,6 +304,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -338,6 +344,30 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToYRow = ARGBToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
   {
     // Allocate rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
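The ((halfwidth + 31) & ~31) expression above rounds the temporary row size up to a multiple of 32, presumably so the "Any" SIMD row variants can write whole vectors without running past the buffer; the * 2 leaves room for both the U and V temporary rows in one allocation (row_v presumably follows row_u). The rounding idiom itself, for reference:

// Round n up to the next multiple of 32 (a power of two), e.g. 45 -> 64.
static int RoundUp32(int n) {
  return (n + 31) & ~31;
}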
@@ -364,21 +394,23 @@
 
 // Same as NV12 but U and V swapped.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int ARGBToNV21(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
   void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                       int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -423,6 +455,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -447,6 +495,30 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToYRow = ARGBToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
   {
     // Allocate rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
@@ -473,19 +545,22 @@
 
 // Convert ARGB to YUY2.
 LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int ARGBToYUY2(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
   void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
 
-  if (!src_argb || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -495,8 +570,7 @@
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yuy2 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yuy2 = 0;
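The "coalesce rows" blocks throughout these converters rely on a simple observation: when every stride equals the tightly packed row size, the image is one contiguous run of pixels, so it can be processed as a single row of width * height pixels with the strides zeroed out. A standalone sketch of the transformation, here for a 4-byte-per-pixel source and a 2-byte-per-pixel destination as in ARGBToYUY2:

// Illustrative only: mirrors the row-coalescing test used above.
static void CoalesceRows(int& width, int& height, int& src_stride,
                         int& dst_stride) {
  if (src_stride == width * 4 && dst_stride == width * 2) {
    width *= height;  // treat the whole image as one long row
    height = 1;
    src_stride = dst_stride = 0;
  }
}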
@@ -537,6 +611,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -553,6 +643,30 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToYRow = ARGBToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate rows of yuv.
@@ -575,19 +689,22 @@
 
 // Convert ARGB to UYVY.
 LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int ARGBToUYVY(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
+                      uint8* dst_v, int width) = ARGBToUVRow_C;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
   void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
 
-  if (!src_argb || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -597,8 +714,7 @@
     dst_stride_uyvy = -dst_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_uyvy == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_uyvy = 0;
@@ -639,6 +755,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -655,6 +787,30 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToYRow = ARGBToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate rows of yuv.
@@ -677,9 +833,12 @@
 
 // Convert ARGB to I400.
 LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int ARGBToI400(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
       ARGBToYRow_C;
@@ -692,8 +851,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = 0;
@@ -722,6 +880,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    ARGBToYRow = ARGBToYRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToYRow(src_argb, dst_y, width);
@@ -732,26 +906,29 @@
 }
 
 // Shuffle table for converting ARGB to RGBA.
-static uvec8 kShuffleMaskARGBToRGBA = {
-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
+static uvec8 kShuffleMaskARGBToRGBA = {3u,  0u, 1u, 2u,  7u,  4u,  5u,  6u,
+                                       11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
 
 // Convert ARGB to RGBA.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return ARGBShuffle(src_argb, src_stride_argb,
-                     dst_rgba, dst_stride_rgba,
-                     (const uint8*)(&kShuffleMaskARGBToRGBA),
-                     width, height);
+int ARGBToRGBA(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
+                     (const uint8*)(&kShuffleMaskARGBToRGBA), width, height);
 }
 
 // Convert ARGB To RGB24.
 LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
+int ARGBToRGB24(const uint8* src_argb,
+                int src_stride_argb,
+                uint8* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
   int y;
   void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
       ARGBToRGB24Row_C;
@@ -764,8 +941,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb24 == width * 3) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb24 = 0;
@@ -786,6 +962,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -797,9 +981,12 @@
 
 // Convert ARGB To RAW.
 LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
+int ARGBToRAW(const uint8* src_argb,
+              int src_stride_argb,
+              uint8* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
   int y;
   void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
       ARGBToRAWRow_C;
@@ -812,8 +999,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_raw == width * 3) {
+  if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_raw = 0;
@@ -834,6 +1020,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORAWROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRAWRow = ARGBToRAWRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRAWRow(src_argb, dst_raw, width);
@@ -845,20 +1039,22 @@
 
 // Ordered 4x4 dither for 888 to 565.  Values from 0 to 7.
 static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
 };
 
 // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
 LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
+int ARGBToRGB565Dither(const uint8* src_argb,
+                       int src_stride_argb,
+                       uint8* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8* dither4x4,
+                       int width,
+                       int height) {
   int y;
   void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+                                const uint32 dither4, int width) =
+      ARGBToRGB565DitherRow_C;
   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
@@ -894,9 +1090,19 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+    }
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBToRGB565DitherRow(src_argb, dst_rgb565,
-                          *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+                          *(uint32*)(dither4x4 + ((y & 3) << 2)),
+                          width); /* NOLINT */
     src_argb += src_stride_argb;
     dst_rgb565 += dst_stride_rgb565;
   }
@@ -906,9 +1112,12 @@
 // Convert ARGB To RGB565.
 // TODO(fbarchard): Consider using dither function low level with zeros.
 LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int ARGBToRGB565(const uint8* src_argb,
+                 int src_stride_argb,
+                 uint8* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
   void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
       ARGBToRGB565Row_C;
@@ -921,8 +1130,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb565 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb565 = 0;
@@ -951,6 +1159,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRGB565Row(src_argb, dst_rgb565, width);
@@ -962,9 +1178,12 @@
 
 // Convert ARGB To ARGB1555.
 LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
+int ARGBToARGB1555(const uint8* src_argb,
+                   int src_stride_argb,
+                   uint8* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height) {
   int y;
   void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
       ARGBToARGB1555Row_C;
@@ -977,8 +1196,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb1555 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb1555 = 0;
@@ -1007,6 +1225,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOARGB1555ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToARGB1555Row(src_argb, dst_argb1555, width);
@@ -1018,9 +1244,12 @@
 
 // Convert ARGB To ARGB4444.
 LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
+int ARGBToARGB4444(const uint8* src_argb,
+                   int src_stride_argb,
+                   uint8* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height) {
   int y;
   void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
       ARGBToARGB4444Row_C;
@@ -1033,8 +1262,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb4444 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb4444 = 0;
@@ -1063,6 +1291,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOARGB4444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToARGB4444Row(src_argb, dst_argb4444, width);
@@ -1074,19 +1310,22 @@
 
 // Convert ARGB to J420. (JPeg full range I420).
 LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToJ420(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_yj,
+               int dst_stride_yj,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
   void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
   void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
       ARGBToYJRow_C;
-  if (!src_argb ||
-      !dst_yj || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1129,6 +1368,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVJRow = ARGBToUVJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -1148,19 +1403,22 @@
 
 // Convert ARGB to J422. (JPeg full range I422).
 LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToJ422(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_yj,
+               int dst_stride_yj,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
   void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
   void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
       ARGBToYJRow_C;
-  if (!src_argb ||
-      !dst_yj || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1170,10 +1428,8 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yj == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_argb == width * 4 && dst_stride_yj == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
@@ -1212,6 +1468,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVJRow = ARGBToUVJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
@@ -1226,9 +1498,12 @@
 
 // Convert ARGB to J400.
 LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               int width, int height) {
+int ARGBToJ400(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height) {
   int y;
   void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
       ARGBToYJRow_C;
@@ -1241,8 +1516,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yj == width) {
+  if (src_stride_argb == width * 4 && dst_stride_yj == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yj = 0;
@@ -1271,6 +1545,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToYJRow(src_argb, dst_yj, width);
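The J-suffixed converters ending here produce JPEG full-range output (per the "(JPeg full range I420)" comment above), in contrast to the limited-range I-suffixed variants. A hedged usage sketch of ARGBToJ400 as reformatted above (header name assumed):

#include <cstdint>
#include <vector>
#include "libyuv/convert_from_argb.h"  // assumed location of ARGBToJ400

// Produce a full-range (0..255) luma plane from ARGB, per the J-suffix convention.
bool ToJ400(const uint8_t* argb, int argb_stride, int width, int height,
            std::vector<uint8_t>* yj) {
  yj->resize(static_cast<size_t>(width) * height);
  return libyuv::ARGBToJ400(argb, argb_stride, yj->data(), width, width,
                            height) == 0;
}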
diff --git a/files/source/convert_jpeg.cc b/files/source/convert_jpeg.cc
index 90f550a..216a9f2 100644
--- a/files/source/convert_jpeg.cc
+++ b/files/source/convert_jpeg.cc
@@ -37,13 +37,9 @@
                          const int* strides,
                          int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I420Copy(data[0], strides[0],
-           data[1], strides[1],
-           data[2], strides[2],
-           dest->y, dest->y_stride,
-           dest->u, dest->u_stride,
-           dest->v, dest->v_stride,
-           dest->w, rows);
+  I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+           dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+           dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -55,13 +51,9 @@
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I422ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+             dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -73,31 +65,9 @@
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I444ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI411ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = (I420Buffers*)(opaque);
-  I411ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+             dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -109,11 +79,8 @@
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I400ToI420(data[0], strides[0],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
+             dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -122,8 +89,7 @@
 
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
-             int* width, int* height) {
+int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) {
   MJpegDecoder mjpeg_decoder;
   LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret) {
@@ -139,11 +105,16 @@
 LIBYUV_API
 int MJPGToI420(const uint8* sample,
                size_t sample_size,
-               uint8* y, int y_stride,
-               uint8* u, int u_stride,
-               uint8* v, int v_stride,
-               int w, int h,
-               int dw, int dh) {
+               uint8* y,
+               int y_stride,
+               uint8* u,
+               int u_stride,
+               uint8* v,
+               int v_stride,
+               int w,
+               int h,
+               int dw,
+               int dh) {
   if (sample_size == kUnknownDataSize) {
     // ERROR: MJPEG frame size unknown
     return -1;
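A hedged sketch of using MJPGSize together with MJPGToI420, whose signatures appear in this file (the header name libyuv/convert.h and the 0-on-success return of MJPGSize are assumptions; the last two arguments are the destination width and height):

#include <cstddef>
#include <cstdint>
#include <vector>
#include "libyuv/convert.h"  // assumed location of MJPGSize / MJPGToI420

// Decode one MJPEG frame into freshly sized I420 planes. Returns 0 on success.
int DecodeMjpegToI420(const uint8_t* sample, size_t sample_size,
                      std::vector<uint8_t>* y, std::vector<uint8_t>* u,
                      std::vector<uint8_t>* v, int* width, int* height) {
  if (libyuv::MJPGSize(sample, sample_size, width, height) != 0) {
    return -1;
  }
  const int w = *width, h = *height;
  const int halfw = (w + 1) / 2, halfh = (h + 1) / 2;
  y->resize(static_cast<size_t>(w) * h);
  u->resize(static_cast<size_t>(halfw) * halfh);
  v->resize(static_cast<size_t>(halfw) * halfh);
  return libyuv::MJPGToI420(sample, sample_size, y->data(), w, u->data(), halfw,
                            v->data(), halfw, w, h, w, h);
}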
@@ -152,17 +123,16 @@
   // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
+  if (ret &&
+      (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
     // ERROR: MJPEG frame has unexpected dimensions
     mjpeg_decoder.UnloadFrame();
     return 1;  // runtime failure
   }
   if (ret) {
-    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+    I420Buffers bufs = {y, y_stride, u, u_stride, v, v_stride, dw, dh};
     // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
         mjpeg_decoder.GetNumComponents() == 3 &&
         mjpeg_decoder.GetVertSampFactor(0) == 2 &&
         mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -171,7 +141,7 @@
         mjpeg_decoder.GetVertSampFactor(2) == 1 &&
         mjpeg_decoder.GetHorizSampFactor(2) == 1) {
       ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
-    // YUV422
+      // YUV422
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -182,7 +152,7 @@
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
       ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
-    // YUV444
+      // YUV444
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -193,18 +163,7 @@
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
       ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
-    // YUV411
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
-    // YUV400
+      // YUV400
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceGrayscale &&
                mjpeg_decoder.GetNumComponents() == 1 &&
@@ -213,7 +172,7 @@
       ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
     } else {
       // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
+      // factors that occur in practice.
       // ERROR: Unable to convert MJPEG frame because format is not supported
       mjpeg_decoder.UnloadFrame();
       return 1;
@@ -231,15 +190,12 @@
 };
 
 static void JpegI420ToARGB(void* opaque,
-                         const uint8* const* data,
-                         const int* strides,
-                         int rows) {
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I420ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
@@ -249,11 +205,8 @@
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I422ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
@@ -263,25 +216,8 @@
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I444ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI411ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I411ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
@@ -291,9 +227,7 @@
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I400ToARGB(data[0], strides[0],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
@@ -303,9 +237,12 @@
 LIBYUV_API
 int MJPGToARGB(const uint8* sample,
                size_t sample_size,
-               uint8* argb, int argb_stride,
-               int w, int h,
-               int dw, int dh) {
+               uint8* argb,
+               int argb_stride,
+               int w,
+               int h,
+               int dw,
+               int dh) {
   if (sample_size == kUnknownDataSize) {
     // ERROR: MJPEG frame size unknown
     return -1;
@@ -314,17 +251,16 @@
   // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
+  if (ret &&
+      (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
     // ERROR: MJPEG frame has unexpected dimensions
     mjpeg_decoder.UnloadFrame();
     return 1;  // runtime failure
   }
   if (ret) {
-    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+    ARGBBuffers bufs = {argb, argb_stride, dw, dh};
     // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
         mjpeg_decoder.GetNumComponents() == 3 &&
         mjpeg_decoder.GetVertSampFactor(0) == 2 &&
         mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -333,7 +269,7 @@
         mjpeg_decoder.GetVertSampFactor(2) == 1 &&
         mjpeg_decoder.GetHorizSampFactor(2) == 1) {
       ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
-    // YUV422
+      // YUV422
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -344,7 +280,7 @@
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
       ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
-    // YUV444
+      // YUV444
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -355,18 +291,7 @@
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
       ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
-    // YUV411
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
-    // YUV400
+      // YUV400
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceGrayscale &&
                mjpeg_decoder.GetNumComponents() == 1 &&
@@ -375,7 +300,7 @@
       ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
     } else {
       // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
+      // factors that occur in practice.
       // ERROR: Unable to convert MJPEG frame because format is not supported
       mjpeg_decoder.UnloadFrame();
       return 1;
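
Note on the dispatch above: both MJPG converters key off libjpeg's per-component sampling factors to pick a YUV420/422/444/400 path (4:1:1 support is dropped by this CL). A minimal standalone sketch of that same mapping, independent of MJpegDecoder and with a hypothetical helper name, for reference:

// Sketch only: maps the Y component's sampling factors to a subsampling label,
// assuming chroma components sample at 1x1, as checked in the code above.
#include <string>

std::string GuessSubsampling(int y_horiz, int y_vert, int num_components) {
  if (num_components == 1) return "YUV400";          // grayscale, Y only
  if (y_horiz == 2 && y_vert == 2) return "YUV420";  // chroma halved both ways
  if (y_horiz == 2 && y_vert == 1) return "YUV422";  // chroma halved horizontally
  if (y_horiz == 1 && y_vert == 1) return "YUV444";  // full-resolution chroma
  return "unsupported";                              // e.g. 4:1:1, now rejected
}
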
diff --git a/files/source/convert_to_argb.cc b/files/source/convert_to_argb.cc
index bccb34c..63a5104 100644
--- a/files/source/convert_to_argb.cc
+++ b/files/source/convert_to_argb.cc
@@ -29,11 +29,16 @@
 // sample_size is measured in bytes and is the size of the frame.
 //   With MJPEG it is the compressed size of the frame.
 LIBYUV_API
-int ConvertToARGB(const uint8* sample, size_t sample_size,
-                  uint8* crop_argb, int argb_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToARGB(const uint8* sample,
+                  size_t sample_size,
+                  uint8* crop_argb,
+                  int argb_stride,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
                   uint32 fourcc) {
   uint32 format = CanonicalFourCC(fourcc);
@@ -49,16 +54,15 @@
   // and then rotate the ARGB to the final destination buffer.
   // For in-place conversion, if destination crop_argb is same as source sample,
   // also enable temporary buffer.
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
-      crop_argb == sample;
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_ARGB) || crop_argb == sample;
   uint8* dest_argb = crop_argb;
   int dest_argb_stride = argb_stride;
   uint8* rotate_buffer = NULL;
   int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
 
-  if (crop_argb == NULL || sample == NULL ||
-      src_width <= 0 || crop_width <= 0 ||
-      src_height == 0 || crop_height == 0) {
+  if (crop_argb == NULL || sample == NULL || src_width <= 0 ||
+      crop_width <= 0 || src_height == 0 || crop_height == 0) {
     return -1;
   }
   if (src_height < 0) {
@@ -67,7 +71,7 @@
 
   if (need_buf) {
     int argb_size = crop_width * 4 * abs_crop_height;
-    rotate_buffer = (uint8*)malloc(argb_size);
+    rotate_buffer = (uint8*)malloc(argb_size); /* NOLINT */
     if (!rotate_buffer) {
       return 1;  // Out of memory runtime error.
     }
@@ -79,102 +83,85 @@
     // Single plane formats
     case FOURCC_YUY2:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
+      r = YUY2ToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_UYVY:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
+      r = UYVYToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_24BG:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToARGB(src, src_width * 3,
-                      crop_argb, argb_stride,
-                      crop_width, inv_crop_height);
+      r = RGB24ToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
+                      inv_crop_height);
       break;
     case FOURCC_RAW:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToARGB(src, src_width * 3,
-                    crop_argb, argb_stride,
-                    crop_width, inv_crop_height);
+      r = RAWToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
+                    inv_crop_height);
       break;
     case FOURCC_ARGB:
-      if (!need_buf && !rotation ) {
+      if (!need_buf && !rotation) {
         src = sample + (src_width * crop_y + crop_x) * 4;
-        r = ARGBToARGB(src, src_width * 4,
-                       crop_argb, argb_stride,
-                       crop_width, inv_crop_height);
+        r = ARGBToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+                       inv_crop_height);
       }
       break;
     case FOURCC_BGRA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = BGRAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_ABGR:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = ABGRToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = RGBAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToARGB(src, src_width * 2,
-                       crop_argb, argb_stride,
-                       crop_width, inv_crop_height);
+      r = RGB565ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
+                       inv_crop_height);
       break;
     case FOURCC_RGBO:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB1555ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
+                         inv_crop_height);
       break;
     case FOURCC_R444:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB4444ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
+                         inv_crop_height);
       break;
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
-      r = I400ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I400ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
+                     inv_crop_height);
       break;
 
     // Biplanar formats
     case FOURCC_NV12:
       src = sample + (src_width * crop_y + crop_x);
       src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      r = NV12ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
+                     argb_stride, crop_width, inv_crop_height);
       break;
     case FOURCC_NV21:
       src = sample + (src_width * crop_y + crop_x);
       src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
       // Call NV12 but with u and v parameters swapped.
-      r = NV21ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
+                     argb_stride, crop_width, inv_crop_height);
       break;
     case FOURCC_M420:
       src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = M420ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
+                     inv_crop_height);
       break;
     // Triplanar formats
     case FOURCC_I420:
@@ -186,20 +173,17 @@
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
         src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       } else {
         src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       }
-      r = I420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     crop_argb, argb_stride, crop_width, inv_crop_height);
       break;
     }
 
@@ -210,14 +194,11 @@
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       src_u = sample + src_width * abs_src_height +
-          (halfwidth * crop_y + crop_x) / 2;
+              (halfwidth * crop_y + crop_x) / 2;
       src_v = sample + src_width * abs_src_height +
-          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      r = J420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+              halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     crop_argb, argb_stride, crop_width, inv_crop_height);
       break;
     }
 
@@ -228,21 +209,18 @@
       const uint8* src_v;
       int halfwidth = (src_width + 1) / 2;
       if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       }
-      r = I422ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     crop_argb, argb_stride, crop_width, inv_crop_height);
       break;
     }
     case FOURCC_I444:
@@ -257,32 +235,14 @@
         src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
-      r = I444ToARGB(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToARGB(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     crop_argb, argb_stride, crop_width, inv_crop_height);
       break;
     }
 #ifdef HAVE_JPEG
     case FOURCC_MJPG:
-      r = MJPGToARGB(sample, sample_size,
-                     crop_argb, argb_stride,
-                     src_width, abs_src_height, crop_width, inv_crop_height);
+      r = MJPGToARGB(sample, sample_size, crop_argb, argb_stride, src_width,
+                     abs_src_height, crop_width, inv_crop_height);
       break;
 #endif
     default:
@@ -291,16 +251,14 @@
 
   if (need_buf) {
     if (!r) {
-      r = ARGBRotate(crop_argb, argb_stride,
-                     dest_argb, dest_argb_stride,
+      r = ARGBRotate(crop_argb, argb_stride, dest_argb, dest_argb_stride,
                      crop_width, abs_crop_height, rotation);
     }
     free(rotate_buffer);
   } else if (rotation) {
     src = sample + (src_width * crop_y + crop_x) * 4;
-    r = ARGBRotate(src, src_width * 4,
-                   crop_argb, argb_stride,
-                   crop_width, inv_crop_height, rotation);
+    r = ARGBRotate(src, src_width * 4, crop_argb, argb_stride, crop_width,
+                   inv_crop_height, rotation);
   }
 
   return r;
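
Aside: the per-format source pointers in ConvertToARGB above are plain "origin plus crop offset" arithmetic over packed rows. A small sketch of the packed-format case, with a hypothetical helper name and the bytes-per-pixel passed explicitly:

#include <stddef.h>
#include <stdint.h>

// Sketch: byte address of the (crop_x, crop_y) pixel in a packed image whose
// rows are width * bytes_per_pixel bytes with no padding, matching expressions
// like sample + (src_width * crop_y + crop_x) * 4 in the code above.
static inline const uint8_t* CropOrigin(const uint8_t* sample,
                                        int width, int crop_x, int crop_y,
                                        int bytes_per_pixel) {
  return sample +
         (static_cast<ptrdiff_t>(width) * crop_y + crop_x) * bytes_per_pixel;
}
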
diff --git a/files/source/convert_to_i420.cc b/files/source/convert_to_i420.cc
index e5f307c..a50689d 100644
--- a/files/source/convert_to_i420.cc
+++ b/files/source/convert_to_i420.cc
@@ -27,12 +27,18 @@
 LIBYUV_API
 int ConvertToI420(const uint8* sample,
                   size_t sample_size,
-                  uint8* y, int y_stride,
-                  uint8* u, int u_stride,
-                  uint8* v, int v_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+                  uint8* y,
+                  int y_stride,
+                  uint8* u,
+                  int u_stride,
+                  uint8* v,
+                  int v_stride,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
                   uint32 fourcc) {
   uint32 format = CanonicalFourCC(fourcc);
@@ -43,9 +49,10 @@
   // TODO(nisse): Why allow crop_height < 0?
   const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
   int r = 0;
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
-      format != FOURCC_NV12 && format != FOURCC_NV21 &&
-      format != FOURCC_YV12) || y == sample;
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
+       format != FOURCC_NV21 && format != FOURCC_YV12) ||
+      y == sample;
   uint8* tmp_y = y;
   uint8* tmp_u = u;
   uint8* tmp_v = v;
@@ -56,8 +63,7 @@
   const int inv_crop_height =
       (src_height < 0) ? -abs_crop_height : abs_crop_height;
 
-  if (!y || !u || !v || !sample ||
-      src_width <= 0 || crop_width <= 0  ||
+  if (!y || !u || !v || !sample || src_width <= 0 || crop_width <= 0 ||
       src_height == 0 || crop_height == 0) {
     return -1;
   }
@@ -70,7 +76,7 @@
   if (need_buf) {
     int y_size = crop_width * abs_crop_height;
     int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
-    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); /* NOLINT */
     if (!rotate_buffer) {
       return 1;  // Out of memory runtime error.
     }
@@ -85,130 +91,85 @@
     // Single plane formats
     case FOURCC_YUY2:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = YUY2ToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
+                     v_stride, crop_width, inv_crop_height);
       break;
     case FOURCC_UYVY:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = UYVYToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
+                     v_stride, crop_width, inv_crop_height);
       break;
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToI420(src, src_width * 2,
-                       y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       crop_width, inv_crop_height);
+      r = RGB565ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
+                       v_stride, crop_width, inv_crop_height);
       break;
     case FOURCC_RGBO:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB1555ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
+                         v_stride, crop_width, inv_crop_height);
       break;
     case FOURCC_R444:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB4444ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
+                         v_stride, crop_width, inv_crop_height);
       break;
     case FOURCC_24BG:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToI420(src, src_width * 3,
-                      y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
+      r = RGB24ToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
                       crop_width, inv_crop_height);
       break;
     case FOURCC_RAW:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToI420(src, src_width * 3,
-                    y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
+      r = RAWToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
                     crop_width, inv_crop_height);
       break;
     case FOURCC_ARGB:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
+      r = ARGBToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_BGRA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
+      r = BGRAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_ABGR:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
+      r = ABGRToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_RGBA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
+      r = RGBAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
-      r = I400ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
+      r = I400ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
                      crop_width, inv_crop_height);
       break;
     // Biplanar formats
     case FOURCC_NV12:
       src = sample + (src_width * crop_y + crop_x);
       src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           u, u_stride,
-                           v, v_stride,
-                           crop_width, inv_crop_height, rotation);
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
+                           y_stride, u, u_stride, v, v_stride, crop_width,
+                           inv_crop_height, rotation);
       break;
     case FOURCC_NV21:
       src = sample + (src_width * crop_y + crop_x);
       src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
       // Call NV12 but with u and v parameters swapped.
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           v, v_stride,
-                           u, u_stride,
-                           crop_width, inv_crop_height, rotation);
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
+                           y_stride, v, v_stride, u, u_stride, crop_width,
+                           inv_crop_height, rotation);
       break;
     case FOURCC_M420:
       src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
+      r = M420ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
                      crop_width, inv_crop_height);
       break;
     // Triplanar formats
@@ -221,22 +182,18 @@
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
         src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       } else {
         src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       }
-      r = I420Rotate(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height, rotation);
+      r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
+                     y_stride, u, u_stride, v, v_stride, crop_width,
+                     inv_crop_height, rotation);
       break;
     }
     case FOURCC_I422:
@@ -246,23 +203,19 @@
       const uint8* src_v;
       int halfwidth = (src_width + 1) / 2;
       if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       }
-      r = I422ToI420(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
+                     y_stride, u, u_stride, v, v_stride, crop_width,
+                     inv_crop_height);
       break;
     }
     case FOURCC_I444:
@@ -277,37 +230,14 @@
         src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
-      r = I444ToI420(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToI420(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, y,
+                     y_stride, u, u_stride, v, v_stride, crop_width,
+                     inv_crop_height);
       break;
     }
 #ifdef HAVE_JPEG
     case FOURCC_MJPG:
-      r = MJPGToI420(sample, sample_size,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
+      r = MJPGToI420(sample, sample_size, y, y_stride, u, u_stride, v, v_stride,
                      src_width, abs_src_height, crop_width, inv_crop_height);
       break;
 #endif
@@ -317,13 +247,9 @@
 
   if (need_buf) {
     if (!r) {
-      r = I420Rotate(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     tmp_y, tmp_y_stride,
-                     tmp_u, tmp_u_stride,
-                     tmp_v, tmp_v_stride,
-                     crop_width, abs_crop_height, rotation);
+      r = I420Rotate(y, y_stride, u, u_stride, v, v_stride, tmp_y, tmp_y_stride,
+                     tmp_u, tmp_u_stride, tmp_v, tmp_v_stride, crop_width,
+                     abs_crop_height, rotation);
     }
     free(rotate_buffer);
   }
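
For context, a typical caller of the reflowed ConvertToI420 signature above looks like the sketch below. Buffer names and dimensions are hypothetical; the parameter order matches the declaration in this diff (no crop, no rotation, so destination strides are simply the width and half-width).

#include "libyuv/convert.h"       // ConvertToI420, RotationMode
#include "libyuv/video_common.h"  // FOURCC_NV21

// Sketch: convert a full NV21 camera frame to I420. The caller is assumed to
// have allocated the Y/U/V planes with the strides shown.
int ConvertNV21Sample(const uint8* sample, size_t sample_size,
                      uint8* y, uint8* u, uint8* v, int width, int height) {
  const int half_width = (width + 1) / 2;
  return libyuv::ConvertToI420(sample, sample_size,
                               y, width,       // Y plane and stride
                               u, half_width,  // U plane and stride
                               v, half_width,  // V plane and stride
                               0, 0,           // crop_x, crop_y
                               width, height,  // source dimensions
                               width, height,  // crop dimensions
                               libyuv::kRotate0, libyuv::FOURCC_NV21);
}
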
diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc
index 84927eb..9ff9326 100644
--- a/files/source/cpu_id.cc
+++ b/files/source/cpu_id.cc
@@ -13,7 +13,7 @@
 #if defined(_MSC_VER)
 #include <intrin.h>  // For __cpuidex()
 #endif
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+#if !defined(__pnacl__) && !defined(__CLR_VER) &&                           \
     !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
     defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
 #include <immintrin.h>  // For _xgetbv()
@@ -44,8 +44,8 @@
 #endif
 
 // Low level cpuid for X86.
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) &&                                     \
     !defined(__pnacl__) && !defined(__CLR_VER)
 LIBYUV_API
 void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
@@ -68,24 +68,24 @@
   if (info_ecx == 0) {
     __cpuid((int*)(cpu_info), info_eax);
   } else {
-    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
   }
 #endif
 // GCC version uses inline x86 assembly.
 #else  // defined(_MSC_VER)
   uint32 info_ebx, info_edx;
-  asm volatile (
-#if defined( __i386__) && defined(__PIC__)
-    // Preserve ebx for fpic 32 bit.
-    "mov %%ebx, %%edi                          \n"
-    "cpuid                                     \n"
-    "xchg %%edi, %%ebx                         \n"
-    : "=D" (info_ebx),
+  asm volatile(
+#if defined(__i386__) && defined(__PIC__)
+      // Preserve ebx for fpic 32 bit.
+      "mov %%ebx, %%edi                          \n"
+      "cpuid                                     \n"
+      "xchg %%edi, %%ebx                         \n"
+      : "=D"(info_ebx),
 #else
-    "cpuid                                     \n"
-    : "=b" (info_ebx),
+      "cpuid                                     \n"
+      : "=b"(info_ebx),
 #endif  //  defined( __i386__) && defined(__PIC__)
-      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+        "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
   cpu_info[0] = info_eax;
   cpu_info[1] = info_ebx;
   cpu_info[2] = info_ecx;
@@ -95,6 +95,8 @@
 #else  // (defined(_M_IX86) || defined(_M_X64) ...
 LIBYUV_API
 void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+  (void)eax;
+  (void)ecx;
   cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
 }
 #endif
@@ -111,20 +113,22 @@
 #if defined(_M_IX86) && (_MSC_VER < 1900)
 #pragma optimize("g", off)
 #endif
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) &&                                     \
     !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
-#define HAS_XGETBV
 // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
 int GetXCR0() {
   uint32 xcr0 = 0u;
 #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
   xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
 #elif defined(__i386__) || defined(__x86_64__)
-  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
 #endif  // defined(__i386__) || defined(__x86_64__)
   return xcr0;
 }
+#else
+// xgetbv unavailable to query for OSSave support.  Return 0.
+#define GetXCR0() 0
 #endif  // defined(_M_IX86) || defined(_M_X64) ..
 // Return optimization to previous setting.
 #if defined(_M_IX86) && (_MSC_VER < 1900)
@@ -133,8 +137,7 @@
 
 // based on libvpx arm_cpudetect.c
 // For Arm, but public to allow testing on any CPU
-LIBYUV_API SAFEBUFFERS
-int ArmCpuCaps(const char* cpuinfo_name) {
+LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
   char cpuinfo_line[512];
   FILE* f = fopen(cpuinfo_name, "r");
   if (!f) {
@@ -161,6 +164,38 @@
   return 0;
 }
 
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
+                                       const char ase[]) {
+  char cpuinfo_line[512];
+  int len = (int)strlen(ase);
+  FILE* f = fopen(cpuinfo_name, "r");
+  if (!f) {
+    // ase enabled if /proc/cpuinfo is unavailable.
+    if (strcmp(ase, " msa") == 0) {
+      return kCpuHasMSA;
+    }
+    if (strcmp(ase, " dspr2") == 0) {
+      return kCpuHasDSPR2;
+    }
+  }
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+      char* p = strstr(cpuinfo_line, ase);
+      if (p && (p[len] == ' ' || p[len] == '\n')) {
+        fclose(f);
+        if (strcmp(ase, " msa") == 0) {
+          return kCpuHasMSA;
+        }
+        if (strcmp(ase, " dspr2") == 0) {
+          return kCpuHasDSPR2;
+        }
+      }
+    }
+  }
+  fclose(f);
+  return 0;
+}
+
 // CPU detect function for SIMD instruction sets.
 LIBYUV_API
 int cpu_info_ = 0;  // cpu_info is not initialized yet.
@@ -184,39 +219,35 @@
 }
 #endif
 
-LIBYUV_API SAFEBUFFERS
-int InitCpuFlags(void) {
-  // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
+LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) {
   int cpu_info = 0;
 #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
-  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info0[4] = {0, 0, 0, 0};
+  uint32 cpu_info1[4] = {0, 0, 0, 0};
+  uint32 cpu_info7[4] = {0, 0, 0, 0};
   CpuId(0, 0, cpu_info0);
   CpuId(1, 0, cpu_info1);
   if (cpu_info0[0] >= 7) {
     CpuId(7, 0, cpu_info7);
   }
-  cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+  cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
-             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
-             ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
-             kCpuHasX86;
+             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
 
-#ifdef HAS_XGETBV
-  // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
+  // AVX requires OS saves YMM registers.
   if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
       ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
-    cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
+    cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+                ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+                ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
 
     // Detect AVX512bw
     if ((GetXCR0() & 0xe0) == 0xe0) {
       cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
     }
   }
-#endif
 
   // Environment variable overrides for testing.
   if (TestEnv("LIBYUV_DISABLE_X86")) {
@@ -249,15 +280,25 @@
   if (TestEnv("LIBYUV_DISABLE_AVX3")) {
     cpu_info &= ~kCpuHasAVX3;
   }
+  if (TestEnv("LIBYUV_DISABLE_F16C")) {
+    cpu_info &= ~kCpuHasF16C;
+  }
+
 #endif
 #if defined(__mips__) && defined(__linux__)
 #if defined(__mips_dspr2)
   cpu_info |= kCpuHasDSPR2;
 #endif
+#if defined(__mips_msa)
+  cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
+#endif
   cpu_info |= kCpuHasMIPS;
   if (getenv("LIBYUV_DISABLE_DSPR2")) {
     cpu_info &= ~kCpuHasDSPR2;
   }
+  if (getenv("LIBYUV_DISABLE_MSA")) {
+    cpu_info &= ~kCpuHasMSA;
+  }
 #endif
 #if defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__
@@ -283,7 +324,7 @@
   if (TestEnv("LIBYUV_DISABLE_ASM")) {
     cpu_info = 0;
   }
-  cpu_info  |= kCpuInitialized;
+  cpu_info |= kCpuInitialized;
   cpu_info_ = cpu_info;
   return cpu_info;
 }
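
For reference, callers consume the bits assembled above through TestCpuFlag(). A minimal sketch of gating a SIMD path on the detected features; the row-function types here are hypothetical stand-ins, only the TestCpuFlag()/kCpuHasAVX2 usage reflects the libyuv API:

#include "libyuv/cpu_id.h"

// Sketch: pick a row function based on detected CPU features.
typedef void (*RowFunc)(const uint8* src, uint8* dst, int width);

RowFunc ChooseRowFunc(RowFunc c_version, RowFunc avx2_version) {
  if (libyuv::TestCpuFlag(libyuv::kCpuHasAVX2)) {
    return avx2_version;  // cleared when the LIBYUV_DISABLE_AVX2 env override is set
  }
  return c_version;
}
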
diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc
index 5081841..b43c008 100644
--- a/files/source/mjpeg_decoder.cc
+++ b/files/source/mjpeg_decoder.cc
@@ -21,7 +21,7 @@
 
 #if defined(_MSC_VER)
 // disable warning 4324: structure was padded due to __declspec(align())
-#pragma warning(disable:4324)
+#pragma warning(disable : 4324)
 #endif
 
 #endif
@@ -62,6 +62,7 @@
 void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes);  // NOLINT
 void term_source(jpeg_decompress_struct* cinfo);
 void ErrorHandler(jpeg_common_struct* cinfo);
+void OutputHandler(jpeg_common_struct* cinfo);
 
 MJpegDecoder::MJpegDecoder()
     : has_scanline_padding_(LIBYUV_FALSE),
@@ -77,6 +78,7 @@
   decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
   // Override standard exit()-based error handler.
   error_mgr_->base.error_exit = &ErrorHandler;
+  error_mgr_->base.output_message = &OutputHandler;
 #endif
   decompress_struct_->client_data = NULL;
   source_mgr_->init_source = &init_source;
@@ -127,7 +129,7 @@
       if (scanlines_[i]) {
         delete scanlines_[i];
       }
-      scanlines_[i] = new uint8* [scanlines_size];
+      scanlines_[i] = new uint8*[scanlines_size];
       scanlines_sizes_[i] = scanlines_size;
     }
 
@@ -193,13 +195,11 @@
 }
 
 int MJpegDecoder::GetHorizSubSampFactor(int component) {
-  return decompress_struct_->max_h_samp_factor /
-      GetHorizSampFactor(component);
+  return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
 }
 
 int MJpegDecoder::GetVertSubSampFactor(int component) {
-  return decompress_struct_->max_v_samp_factor /
-      GetVertSampFactor(component);
+  return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
 }
 
 int MJpegDecoder::GetImageScanlinesPerImcuRow() {
@@ -243,10 +243,10 @@
 }
 
 // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
-    uint8** planes, int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8** planes,
+                                          int dst_width,
+                                          int dst_height) {
+  if (dst_width != GetWidth() || dst_height > GetHeight()) {
     // ERROR: Bad dimensions
     return LIBYUV_FALSE;
   }
@@ -287,14 +287,13 @@
       for (int i = 0; i < num_outbufs_; ++i) {
         // TODO(fbarchard): Compute skip to avoid this
         assert(skip % GetVertSubSampFactor(i) == 0);
-        int rows_to_skip =
-            DivideAndRoundDown(skip, GetVertSubSampFactor(i));
-        int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
-                                rows_to_skip;
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int scanlines_to_copy =
+            GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
         int data_to_skip = rows_to_skip * GetComponentStride(i);
-        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
-                  planes[i], GetComponentWidth(i),
-                  GetComponentWidth(i), scanlines_to_copy);
+        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
+                  GetComponentWidth(i), GetComponentWidth(i),
+                  scanlines_to_copy);
         planes[i] += scanlines_to_copy * GetComponentWidth(i);
       }
       lines_left -= (GetImageScanlinesPerImcuRow() - skip);
@@ -303,16 +302,15 @@
 
   // Read full MCUs but cropped horizontally
   for (; lines_left > GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
+       lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
       return LIBYUV_FALSE;
     }
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
@@ -326,19 +324,19 @@
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy =
           DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
   return FinishDecode();
 }
 
-LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
-    int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
+                                           void* opaque,
+                                           int dst_width,
+                                           int dst_height) {
+  if (dst_width != GetWidth() || dst_height > GetHeight()) {
     // ERROR: Bad dimensions
     return LIBYUV_FALSE;
   }
@@ -393,7 +391,7 @@
   }
   // Read full MCUs until we get to the crop point.
   for (; lines_left >= GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
+       lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
       return LIBYUV_FALSE;
@@ -433,22 +431,22 @@
 }
 
 void term_source(j_decompress_ptr cinfo) {
-  // Nothing to do.
+  (void)cinfo;  // Nothing to do.
 }
 
 #ifdef HAVE_SETJMP
 void ErrorHandler(j_common_ptr cinfo) {
-  // This is called when a jpeglib command experiences an error. Unfortunately
-  // jpeglib's error handling model is not very flexible, because it expects the
-  // error handler to not return--i.e., it wants the program to terminate. To
-  // recover from errors we use setjmp() as shown in their example. setjmp() is
-  // C's implementation for the "call with current continuation" functionality
-  // seen in some functional programming languages.
-  // A formatted message can be output, but is unsafe for release.
+// This is called when a jpeglib command experiences an error. Unfortunately
+// jpeglib's error handling model is not very flexible, because it expects the
+// error handler to not return--i.e., it wants the program to terminate. To
+// recover from errors we use setjmp() as shown in their example. setjmp() is
+// C's implementation for the "call with current continuation" functionality
+// seen in some functional programming languages.
+// A formatted message can be output, but is unsafe for release.
 #ifdef DEBUG
   char buf[JMSG_LENGTH_MAX];
   (*cinfo->err->format_message)(cinfo, buf);
-  // ERROR: Error in jpeglib: buf
+// ERROR: Error in jpeglib: buf
 #endif
 
   SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
@@ -456,7 +454,13 @@
   // and causes it to return (for a second time) with value 1.
   longjmp(mgr->setjmp_buffer, 1);
 }
-#endif
+
+// Suppress fprintf warnings.
+void OutputHandler(j_common_ptr cinfo) {
+  (void)cinfo;
+}
+
+#endif  // HAVE_SETJMP
 
 void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
   if (num_outbufs != num_outbufs_) {
@@ -465,9 +469,9 @@
     // it.
     DestroyOutputBuffers();
 
-    scanlines_ = new uint8** [num_outbufs];
+    scanlines_ = new uint8**[num_outbufs];
     scanlines_sizes_ = new int[num_outbufs];
-    databuf_ = new uint8* [num_outbufs];
+    databuf_ = new uint8*[num_outbufs];
     databuf_strides_ = new int[num_outbufs];
 
     for (int i = 0; i < num_outbufs; ++i) {
@@ -483,13 +487,13 @@
 
 void MJpegDecoder::DestroyOutputBuffers() {
   for (int i = 0; i < num_outbufs_; ++i) {
-    delete [] scanlines_[i];
-    delete [] databuf_[i];
+    delete[] scanlines_[i];
+    delete[] databuf_[i];
   }
-  delete [] scanlines_;
-  delete [] databuf_;
-  delete [] scanlines_sizes_;
-  delete [] databuf_strides_;
+  delete[] scanlines_;
+  delete[] databuf_;
+  delete[] scanlines_sizes_;
+  delete[] databuf_strides_;
   scanlines_ = NULL;
   databuf_ = NULL;
   scanlines_sizes_ = NULL;
@@ -535,26 +539,26 @@
 
 inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
   return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
-      jpeg_read_raw_data(decompress_struct_,
-                         scanlines_,
-                         GetImageScanlinesPerImcuRow());
+         jpeg_read_raw_data(decompress_struct_, scanlines_,
+                            GetImageScanlinesPerImcuRow());
 }
 
 // The helper function which recognizes the jpeg sub-sampling type.
 JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
-    int* subsample_x, int* subsample_y, int number_of_components) {
+    int* subsample_x,
+    int* subsample_y,
+    int number_of_components) {
   if (number_of_components == 3) {  // Color images.
-    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 2 &&
-        subsample_x[2] == 2 && subsample_y[2] == 2) {
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+        subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) {
       return kJpegYuv420;
     } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 1 &&
-        subsample_x[2] == 2 && subsample_y[2] == 1) {
+               subsample_x[1] == 2 && subsample_y[1] == 1 &&
+               subsample_x[2] == 2 && subsample_y[2] == 1) {
       return kJpegYuv422;
     } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 1 && subsample_y[1] == 1 &&
-        subsample_x[2] == 1 && subsample_y[2] == 1) {
+               subsample_x[1] == 1 && subsample_y[1] == 1 &&
+               subsample_x[2] == 1 && subsample_y[2] == 1) {
       return kJpegYuv444;
     }
   } else if (number_of_components == 1) {  // Grey-scale images.
@@ -567,4 +571,3 @@
 
 }  // namespace libyuv
 #endif  // HAVE_JPEG
-
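
The ErrorHandler comment above describes jpeglib's no-return error model and the setjmp()-based recovery it forces. A self-contained sketch of that idiom, independent of jpeglib and with hypothetical names:

#include <setjmp.h>
#include <stdio.h>

// Sketch: a library whose error hook must never return can still be unwound
// by longjmp()-ing back to a setjmp() point established before the call.
static jmp_buf g_error_jmp;

static void FatalErrorHook() {
  longjmp(g_error_jmp, 1);  // jump back instead of calling exit()
}

static void ParseFrame(int frame_is_corrupt) {
  if (frame_is_corrupt) {
    FatalErrorHook();  // stand-in for jpeglib invoking error_exit
  }
}

static int DecodeWithRecovery(int frame_is_corrupt) {
  if (setjmp(g_error_jmp)) {
    return 1;  // second return from setjmp(): the hook fired, report failure
  }
  ParseFrame(frame_is_corrupt);
  return 0;  // no error raised
}

int main() {
  printf("good frame: %d, corrupt frame: %d\n",
         DecodeWithRecovery(0), DecodeWithRecovery(1));
  return 0;
}
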
diff --git a/files/source/mjpeg_validate.cc b/files/source/mjpeg_validate.cc
index 9c48832..1a17dd7 100644
--- a/files/source/mjpeg_validate.cc
+++ b/files/source/mjpeg_validate.cc
@@ -24,7 +24,7 @@
     const uint8* it = sample;
     while (it < end) {
       // TODO(fbarchard): scan for 0xd9 instead.
-      it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
+      it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
       if (it == NULL) {
         break;
       }
@@ -68,4 +68,3 @@
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
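
The validator's loop above hops between 0xFF bytes with memchr() and then checks whether the byte after each one is the End-Of-Image marker (0xD9). A standalone sketch of that scan, with a hypothetical function name:

#include <stdint.h>
#include <string.h>

// Sketch: report whether a buffer contains a JPEG EOI marker (0xFF 0xD9) by
// jumping from one 0xFF to the next with memchr, mirroring mjpeg_validate.cc.
static int HasJpegEoi(const uint8_t* sample, size_t sample_size) {
  const uint8_t* end = sample + sample_size;
  const uint8_t* it = sample;
  while (it < end) {
    it = static_cast<const uint8_t*>(memchr(it, 0xff, end - it));
    if (it == NULL) {
      break;
    }
    if (it + 1 < end && it[1] == 0xd9) {
      return 1;  // found 0xFF 0xD9
    }
    ++it;  // keep scanning after this 0xFF
  }
  return 0;
}
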
diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc
index 237ab68..b8a53e8 100644
--- a/files/source/planar_functions.cc
+++ b/files/source/planar_functions.cc
@@ -26,14 +26,22 @@
 
 // Copy a plane of data
 LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+void CopyPlane(const uint8* src_y,
+               int src_stride_y,
+               uint8* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -76,15 +84,19 @@
   }
 }
 
+// TODO(fbarchard): Consider support for negative height.
+// TODO(fbarchard): Consider stride measured in bytes.
 LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
-                  uint16* dst_y, int dst_stride_y,
-                  int width, int height) {
+void CopyPlane_16(const uint16* src_y,
+                  int src_stride_y,
+                  uint16* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height) {
   int y;
   void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -120,17 +132,22 @@
 
 // Copy I422.
 LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
+int I422Copy(const uint8* src_y,
+             int src_stride_y,
+             const uint8* src_u,
+             int src_stride_u,
+             const uint8* src_v,
+             int src_stride_v,
+             uint8* dst_y,
+             int dst_stride_y,
+             uint8* dst_u,
+             int dst_stride_u,
+             uint8* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
   int halfwidth = (width + 1) >> 1;
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -143,7 +160,10 @@
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
   }
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
   CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
   CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
   return 0;
@@ -151,16 +171,21 @@
 
 // Copy I444.
 LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
-  if (!src_y || !src_u || !src_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+int I444Copy(const uint8* src_y,
+             int src_stride_y,
+             const uint8* src_u,
+             int src_stride_u,
+             const uint8* src_v,
+             int src_stride_v,
+             uint8* dst_y,
+             int dst_stride_y,
+             uint8* dst_u,
+             int dst_stride_u,
+             uint8* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -174,7 +199,9 @@
     src_stride_v = -src_stride_v;
   }
 
-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
   CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
   CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
   return 0;
@@ -182,9 +209,12 @@
 
 // Copy I400.
 LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int I400ToI400(const uint8* src_y,
+               int src_stride_y,
+               uint8* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -200,11 +230,20 @@
 
 // Convert I420 to I400.
 LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int I420ToI400(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  (void)src_u;
+  (void)src_stride_u;
+  (void)src_v;
+  (void)src_stride_v;
   if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -214,14 +253,159 @@
     src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
+
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   return 0;
 }
 
+// Support function for NV12 etc. UV channels.
+// Width and height are plane sizes (typically half the pixel width).
+LIBYUV_API
+void SplitUVPlane(const uint8* src_uv,
+                  int src_stride_uv,
+                  uint8* dst_u,
+                  int dst_stride_u,
+                  uint8* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height) {
+  int y;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) = SplitUVRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_u = dst_u + (height - 1) * dst_stride_u;
+    dst_v = dst_v + (height - 1) * dst_stride_v;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  // Coalesce rows.
+  if (src_stride_uv == width * 2 && dst_stride_u == width &&
+      dst_stride_v == width) {
+    width *= height;
+    height = 1;
+    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_u, 4) &&
+      IS_ALIGNED(dst_stride_u, 4) && IS_ALIGNED(dst_v, 4) &&
+      IS_ALIGNED(dst_stride_v, 4)) {
+    SplitUVRow = SplitUVRow_Any_DSPR2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_DSPR2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Copy a row of UV.
+    SplitUVRow(src_uv, dst_u, dst_v, width);
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += src_stride_uv;
+  }
+}
+
+LIBYUV_API
+void MergeUVPlane(const uint8* src_u,
+                  int src_stride_u,
+                  const uint8* src_v,
+                  int src_stride_v,
+                  uint8* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height) {
+  int y;
+  void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) = MergeUVRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uv = dst_uv + (height - 1) * dst_stride_uv;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_u == width && src_stride_v == width &&
+      dst_stride_uv == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MergeUVRow = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Merge a row of U and V into a row of UV.
+    MergeUVRow(src_u, src_v, dst_uv, width);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+}
+
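
A hedged usage sketch for the SplitUVPlane/MergeUVPlane pair added above (names and sizes are illustrative; written as plain C, so C++ callers would use the libyuv namespace): round-tripping the interleaved UV plane of an NV12 frame through planar U and V buffers. The width and height passed are the chroma-plane dimensions, i.e. half the luma width and height for NV12.

  // Sketch: split NV12's interleaved UV plane, then merge it back.
  #include "libyuv/planar_functions.h"

  void SplitMergeUVSketch(const uint8* nv12_uv, int uv_stride,
                          uint8* u, uint8* v, uint8* uv_out,
                          int chroma_width, int chroma_height) {
    SplitUVPlane(nv12_uv, uv_stride,
                 u, chroma_width,  // tightly packed planar U
                 v, chroma_width,  // tightly packed planar V
                 chroma_width, chroma_height);
    MergeUVPlane(u, chroma_width, v, chroma_width,
                 uv_out, chroma_width * 2,  // interleaved UV again
                 chroma_width, chroma_height);
  }
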
 // Mirror a plane of data.
-void MirrorPlane(const uint8* src_y, int src_stride_y,
-                 uint8* dst_y, int dst_stride_y,
-                 int width, int height) {
+void MirrorPlane(const uint8* src_y,
+                 int src_stride_y,
+                 uint8* dst_y,
+                 int dst_stride_y,
+                 int width,
+                 int height) {
   int y;
   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
   // Negative height means invert the image.
@@ -256,12 +440,20 @@
 #endif
 // TODO(fbarchard): Make Mirror on MIPS handle unaligned memory.
 #if defined(HAS_MIRRORROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_y, 4) &&
+      IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(dst_y, 4) &&
+      IS_ALIGNED(dst_stride_y, 4)) {
     MirrorRow = MirrorRow_DSPR2;
   }
 #endif
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
+  }
+#endif
 
   // Mirror plane
   for (y = 0; y < height; ++y) {
@@ -273,17 +465,24 @@
 
 // Convert YUY2 to I422.
 LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int YUY2ToI422(const uint8* src_yuy2,
+               int src_stride_yuy2,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) =
-      YUY2ToUV422Row_C;
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int width) = YUY2ToUV422Row_C;
   void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
       YUY2ToYRow_C;
+  if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -291,10 +490,9 @@
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -322,15 +520,23 @@
 #if defined(HAS_YUY2TOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     YUY2ToYRow = YUY2ToYRow_Any_NEON;
-    if (width >= 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-    }
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       YUY2ToYRow = YUY2ToYRow_NEON;
       YUY2ToUV422Row = YUY2ToUV422Row_NEON;
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUV422Row = YUY2ToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
@@ -345,17 +551,24 @@
 
 // Convert UYVY to I422.
 LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int UYVYToI422(const uint8* src_uyvy,
+               int src_stride_uyvy,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToUV422Row)(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) =
-      UYVYToUV422Row_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int width) = UYVYToYRow_C;
+  void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int width) = UYVYToUV422Row_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) =
+      UYVYToYRow_C;
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -363,10 +576,9 @@
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_uyvy == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -394,15 +606,23 @@
 #if defined(HAS_UYVYTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     UYVYToYRow = UYVYToYRow_Any_NEON;
-    if (width >= 16) {
-      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
-    }
+    UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       UYVYToYRow = UYVYToYRow_NEON;
       UYVYToUV422Row = UYVYToUV422Row_NEON;
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUV422Row = UYVYToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
@@ -415,13 +635,82 @@
   return 0;
 }
 
+// Convert YUY2 to Y.
+LIBYUV_API
+int YUY2ToY(const uint8* src_yuy2,
+            int src_stride_yuy2,
+            uint8* dst_y,
+            int dst_stride_y,
+            int width,
+            int height) {
+  int y;
+  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+      YUY2ToYRow_C;
+  if (!src_yuy2 || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = 0;
+  }
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
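
For context on the new YUY2ToY path: YUY2 packs two pixels as Y0 U Y1 V, so extracting luma amounts to keeping every second byte starting at offset 0. A scalar sketch of that behavior (the row functions selected above do the same work with SIMD):

  // Sketch of what a YUY2 -> Y row boils down to: keep the even bytes.
  #include <stdint.h>

  static void Yuy2ToYRowSketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                               int width) {
    int x;
    for (x = 0; x < width; ++x) {
      dst_y[x] = src_yuy2[2 * x];  // odd bytes are the interleaved U/V
    }
  }
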
 // Mirror I400 with optional flipping
 LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
+int I400Mirror(const uint8* src_y,
+               int src_stride_y,
+               uint8* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -437,17 +726,24 @@
 
 // Mirror I420 with optional flipping
 LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420Mirror(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -472,9 +768,12 @@
 
 // ARGB mirror.
 LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int ARGBMirror(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
   void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
       ARGBMirrorRow_C;
@@ -511,6 +810,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 
   // Mirror plane
   for (y = 0; y < height; ++y) {
@@ -544,10 +851,14 @@
 
 // Alpha Blend 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
-              const uint8* src_argb1, int src_stride_argb1,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
+int ARGBBlend(const uint8* src_argb0,
+              int src_stride_argb0,
+              const uint8* src_argb1,
+              int src_stride_argb1,
+              uint8* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
   int y;
   void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
                        uint8* dst_argb, int width) = GetARGBBlend();
@@ -561,8 +872,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -580,14 +890,20 @@
 
 // Alpha Blend plane and store to destination.
 LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
-               const uint8* src_y1, int src_stride_y1,
-               const uint8* alpha, int alpha_stride,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int BlendPlane(const uint8* src_y0,
+               int src_stride_y0,
+               const uint8* src_y1,
+               int src_stride_y1,
+               const uint8* alpha,
+               int alpha_stride,
+               uint8* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
   void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
-      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+                        const uint8* alpha, uint8* dst, int width) =
+      BlendPlaneRow_C;
   if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -599,10 +915,8 @@
   }
 
   // Coalesce rows for Y plane.
-  if (src_stride_y0 == width &&
-      src_stride_y1 == width &&
-      alpha_stride == width &&
-      dst_stride_y == width) {
+  if (src_stride_y0 == width && src_stride_y1 == width &&
+      alpha_stride == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
@@ -610,7 +924,7 @@
 
 #if defined(HAS_BLENDPLANEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-  BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       BlendPlaneRow = BlendPlaneRow_SSSE3;
     }
@@ -618,7 +932,7 @@
 #endif
 #if defined(HAS_BLENDPLANEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-  BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       BlendPlaneRow = BlendPlaneRow_AVX2;
     }
@@ -638,22 +952,34 @@
 #define MAXTWIDTH 2048
 // Alpha Blend YUV images and store to destination.
 LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
-              const uint8* src_u0, int src_stride_u0,
-              const uint8* src_v0, int src_stride_v0,
-              const uint8* src_y1, int src_stride_y1,
-              const uint8* src_u1, int src_stride_u1,
-              const uint8* src_v1, int src_stride_v1,
-              const uint8* alpha, int alpha_stride,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height) {
+int I420Blend(const uint8* src_y0,
+              int src_stride_y0,
+              const uint8* src_u0,
+              int src_stride_u0,
+              const uint8* src_v0,
+              int src_stride_v0,
+              const uint8* src_y1,
+              int src_stride_y1,
+              const uint8* src_u1,
+              int src_stride_u1,
+              const uint8* src_v1,
+              int src_stride_v1,
+              const uint8* alpha,
+              int alpha_stride,
+              uint8* dst_y,
+              int dst_stride_y,
+              uint8* dst_u,
+              int dst_stride_u,
+              uint8* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
   int y;
   // Half width/height for UV.
   int halfwidth = (width + 1) >> 1;
   void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
-      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+                        const uint8* alpha, uint8* dst, int width) =
+      BlendPlaneRow_C;
   void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
   if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
@@ -669,11 +995,8 @@
   }
 
   // Blend Y plane.
-  BlendPlane(src_y0, src_stride_y0,
-             src_y1, src_stride_y1,
-             alpha, alpha_stride,
-             dst_y, dst_stride_y,
-             width, height);
+  BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride,
+             dst_y, dst_stride_y, width, height);
 
 #if defined(HAS_BLENDPLANEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -753,10 +1076,14 @@
 
 // Multiply 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int ARGBMultiply(const uint8* src_argb0,
+                 int src_stride_argb0,
+                 const uint8* src_argb1,
+                 int src_stride_argb1,
+                 uint8* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
   void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
                           int width) = ARGBMultiplyRow_C;
@@ -770,8 +1097,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -801,6 +1127,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBMULTIPLYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_MSA;
+    }
+  }
+#endif
 
   // Multiply plane
   for (y = 0; y < height; ++y) {
@@ -814,10 +1148,14 @@
 
 // Add 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
-            const uint8* src_argb1, int src_stride_argb1,
-            uint8* dst_argb, int dst_stride_argb,
-            int width, int height) {
+int ARGBAdd(const uint8* src_argb0,
+            int src_stride_argb0,
+            const uint8* src_argb1,
+            int src_stride_argb1,
+            uint8* dst_argb,
+            int dst_stride_argb,
+            int width,
+            int height) {
   int y;
   void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
                      int width) = ARGBAddRow_C;
@@ -831,8 +1169,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -867,6 +1204,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBADDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAddRow = ARGBAddRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_MSA;
+    }
+  }
+#endif
 
   // Add plane
   for (y = 0; y < height; ++y) {
@@ -880,10 +1225,14 @@
 
 // Subtract 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int ARGBSubtract(const uint8* src_argb0,
+                 int src_stride_argb0,
+                 const uint8* src_argb1,
+                 int src_stride_argb1,
+                 uint8* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
   void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
                           int width) = ARGBSubtractRow_C;
@@ -897,8 +1246,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -928,6 +1276,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBSUBTRACTROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBSubtractRow = ARGBSubtractRow_MSA;
+    }
+  }
+#endif
 
   // Subtract plane
   for (y = 0; y < height; ++y) {
@@ -939,21 +1295,23 @@
   return 0;
 }
 // Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_rgba, int dst_stride_rgba,
+static int I422ToRGBAMatrix(const uint8* src_y,
+                            int src_stride_y,
+                            const uint8* src_u,
+                            int src_stride_u,
+                            const uint8* src_v,
+                            int src_stride_v,
+                            uint8* dst_rgba,
+                            int dst_stride_rgba,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgba ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -995,6 +1353,14 @@
     I422ToRGBARow = I422ToRGBARow_DSPR2;
   }
 #endif
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
@@ -1008,48 +1374,55 @@
 
 // Convert I422 to RGBA.
 LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return I422ToRGBAMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_rgba, dst_stride_rgba,
-                          &kYuvI601Constants,
-                          width, height);
+int I422ToRGBA(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I422 to BGRA.
 LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  return I422ToRGBAMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_bgra, dst_stride_bgra,
+int I422ToBGRA(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_bgra, dst_stride_bgra,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert NV12 to RGB565.
 LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int NV12ToRGB565(const uint8* src_y,
+                 int src_stride_y,
+                 const uint8* src_uv,
+                 int src_stride_uv,
+                 uint8* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
-  void (*NV12ToRGB565Row)(const uint8* y_buf,
-                          const uint8* uv_buf,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) = NV12ToRGB565Row_C;
-  if (!src_y || !src_uv || !dst_rgb565 ||
-      width <= 0 || height == 0) {
+  void (*NV12ToRGB565Row)(
+      const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1082,6 +1455,14 @@
     }
   }
 #endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
@@ -1096,14 +1477,16 @@
 
 // Convert RAW to RGB24.
 LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
-               uint8* dst_rgb24, int dst_stride_rgb24,
-               int width, int height) {
+int RAWToRGB24(const uint8* src_raw,
+               int src_stride_raw,
+               uint8* dst_rgb24,
+               int dst_stride_rgb24,
+               int width,
+               int height) {
   int y;
   void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
       RAWToRGB24Row_C;
-  if (!src_raw || !dst_rgb24 ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1113,8 +1496,7 @@
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 &&
-      dst_stride_rgb24 == width * 3) {
+  if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_rgb24 = 0;
@@ -1135,6 +1517,14 @@
     }
   }
 #endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToRGB24Row = RAWToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RAWToRGB24Row(src_raw, dst_rgb24, width);
@@ -1145,11 +1535,13 @@
 }
 
 LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
-              int width, int height,
+void SetPlane(uint8* dst_y,
+              int dst_stride_y,
+              int width,
+              int height,
               uint32 value) {
   int y;
-  void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;
+  void (*SetRow)(uint8 * dst, uint8 value, int width) = SetRow_C;
   if (height < 0) {
     height = -height;
     dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1192,22 +1584,26 @@
 
 // Draw a rectangle into I420
 LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int x, int y,
-             int width, int height,
-             int value_y, int value_u, int value_v) {
+int I420Rect(uint8* dst_y,
+             int dst_stride_y,
+             uint8* dst_u,
+             int dst_stride_u,
+             uint8* dst_v,
+             int dst_stride_v,
+             int x,
+             int y,
+             int width,
+             int height,
+             int value_y,
+             int value_u,
+             int value_v) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   uint8* start_y = dst_y + y * dst_stride_y + x;
   uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
   uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
-  if (!dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0 ||
-      x < 0 || y < 0 ||
-      value_y < 0 || value_y > 255 ||
-      value_u < 0 || value_u > 255 ||
+  if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
+      y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
       value_v < 0 || value_v > 255) {
     return -1;
   }
@@ -1220,15 +1616,16 @@
 
 // Draw a rectangle into ARGB
 LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height,
+int ARGBRect(uint8* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
              uint32 value) {
   int y;
-  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;
-  if (!dst_argb ||
-      width <= 0 || height == 0 ||
-      dst_x < 0 || dst_y < 0) {
+  void (*ARGBSetRow)(uint8 * dst_argb, uint32 value, int width) = ARGBSetRow_C;
+  if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
   if (height < 0) {
@@ -1257,6 +1654,14 @@
     ARGBSetRow = ARGBSetRow_X86;
   }
 #endif
+#if defined(HAS_ARGBSETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSetRow = ARGBSetRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_MSA;
+    }
+  }
+#endif
 
   // Set plane
   for (y = 0; y < height; ++y) {
@@ -1280,12 +1685,15 @@
 //   f is foreground pixel premultiplied by alpha
 
 LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
+int ARGBAttenuate(const uint8* src_argb,
+                  int src_stride_argb,
+                  uint8* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   int y;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
-                           int width) = ARGBAttenuateRow_C;
+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBAttenuateRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
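
As a reminder of what ARGBAttenuate computes (a reference sketch, not the library's exact arithmetic): each color channel is scaled by the pixel's alpha to produce premultiplied ARGB. The row functions approximate the divide by 255 with shifts, so their output can differ from this sketch by a small amount.

  // Reference sketch of per-pixel attenuation: B, G, R scaled by A.
  #include <stdint.h>

  static void AttenuatePixelSketch(const uint8_t in[4], uint8_t out[4]) {
    const unsigned a = in[3];
    out[0] = (uint8_t)((in[0] * a) / 255);  // B
    out[1] = (uint8_t)((in[1] * a) / 255);  // G
    out[2] = (uint8_t)((in[2] * a) / 255);  // R
    out[3] = in[3];                         // alpha is preserved
  }
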
@@ -1295,8 +1703,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1325,6 +1732,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBAttenuateRow(src_argb, dst_argb, width);
@@ -1336,9 +1751,12 @@
 
 // Convert preattenuated ARGB to unattenuated ARGB.
 LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height) {
+int ARGBUnattenuate(const uint8* src_argb,
+                    int src_stride_argb,
+                    uint8* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height) {
   int y;
   void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
                              int width) = ARGBUnattenuateRow_C;
@@ -1351,8 +1769,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1373,7 +1790,7 @@
     }
   }
 #endif
-// TODO(fbarchard): Neon version.
+  // TODO(fbarchard): Neon version.
 
   for (y = 0; y < height; ++y) {
     ARGBUnattenuateRow(src_argb, dst_argb, width);
@@ -1385,12 +1802,15 @@
 
 // Convert ARGB to Grayed ARGB.
 LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int ARGBGrayTo(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
+  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBGrayRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1400,8 +1820,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1416,6 +1835,11 @@
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_MSA;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBGrayRow(src_argb, dst_argb, width);
@@ -1427,12 +1851,15 @@
 
 // Make a rectangle of ARGB gray scale.
 LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height) {
+int ARGBGray(uint8* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height) {
   int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
+  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBGrayRow_C;
   uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
@@ -1453,6 +1880,12 @@
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_MSA;
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBGrayRow(dst, dst, width);
     dst += dst_stride_argb;
@@ -1462,10 +1895,14 @@
 
 // Make a rectangle of ARGB Sepia tone.
 LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
-              int dst_x, int dst_y, int width, int height) {
+int ARGBSepia(uint8* dst_argb,
+              int dst_stride_argb,
+              int dst_x,
+              int dst_y,
+              int width,
+              int height) {
   int y;
-  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+  void (*ARGBSepiaRow)(uint8 * dst_argb, int width) = ARGBSepiaRow_C;
   uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
@@ -1486,6 +1923,12 @@
     ARGBSepiaRow = ARGBSepiaRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBSEPIAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_MSA;
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBSepiaRow(dst, width);
     dst += dst_stride_argb;
@@ -1496,13 +1939,17 @@
 // Apply a 4x4 matrix to each ARGB pixel.
 // Note: Normally for shading, but can be used to swizzle or invert.
 LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
+int ARGBColorMatrix(const uint8* src_argb,
+                    int src_stride_argb,
+                    uint8* dst_argb,
+                    int dst_stride_argb,
                     const int8* matrix_argb,
-                    int width, int height) {
+                    int width,
+                    int height) {
   int y;
   void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
-      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+                             const int8* matrix_argb, int width) =
+      ARGBColorMatrixRow_C;
   if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1512,8 +1959,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1539,13 +1985,17 @@
 // Apply a 4x3 matrix to each ARGB pixel.
 // Deprecated.
 LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+int RGBColorMatrix(uint8* dst_argb,
+                   int dst_stride_argb,
                    const int8* matrix_rgb,
-                   int dst_x, int dst_y, int width, int height) {
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
   SIMD_ALIGNED(int8 matrix_argb[16]);
   uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
 
@@ -1565,23 +2015,26 @@
   matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
   matrix_argb[15] = 64;  // 1.0
 
-  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
-                         dst, dst_stride_argb,
-                         &matrix_argb[0], width, height);
+  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, dst,
+                         dst_stride_argb, &matrix_argb[0], width, height);
 }
 
 // Apply a color table to each ARGB pixel.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+int ARGBColorTable(uint8* dst_argb,
+                   int dst_stride_argb,
                    const uint8* table_argb,
-                   int dst_x, int dst_y, int width, int height) {
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+  void (*ARGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb,
                             int width) = ARGBColorTableRow_C;
   uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
   // Coalesce rows.
@@ -1605,15 +2058,19 @@
 // Apply a color table to each ARGB pixel but preserve destination alpha.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+int RGBColorTable(uint8* dst_argb,
+                  int dst_stride_argb,
                   const uint8* table_argb,
-                  int dst_x, int dst_y, int width, int height) {
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height) {
   int y;
-  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+  void (*RGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb,
                            int width) = RGBColorTableRow_C;
   uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
   // Coalesce rows.
@@ -1644,11 +2101,17 @@
 // Caveat - although SSE2 saturates, the C function does not and should be used
 // with care if doing anything but quantization.
 LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
-                 int scale, int interval_size, int interval_offset,
-                 int dst_x, int dst_y, int width, int height) {
+int ARGBQuantize(uint8* dst_argb,
+                 int dst_stride_argb,
+                 int scale,
+                 int interval_size,
+                 int interval_offset,
+                 int dst_x,
+                 int dst_y,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+  void (*ARGBQuantizeRow)(uint8 * dst_argb, int scale, int interval_size,
                           int interval_offset, int width) = ARGBQuantizeRow_C;
   uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
@@ -1681,12 +2144,16 @@
 // Computes table of cumulative sum for image where the value is the sum
 // of all values above and to the left of the entry. Used by ARGBBlur.
 LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
-                             int32* dst_cumsum, int dst_stride32_cumsum,
-                             int width, int height) {
+int ARGBComputeCumulativeSum(const uint8* src_argb,
+                             int src_stride_argb,
+                             int32* dst_cumsum,
+                             int dst_stride32_cumsum,
+                             int width,
+                             int height) {
   int y;
   void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+                                  const int32* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
   int32* previous_cumsum = dst_cumsum;
   if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
     return -1;
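
The table ARGBComputeCumulativeSum builds is a summed-area table, which is what lets ARGBBlur average an arbitrary box in constant work per pixel. A sketch of the standard four-lookup identity (the indexing convention here uses exclusive prefixes and may be off by one relative to the library's internal layout):

  // Sketch: with cumsum[y][x] = sum of all pixels p[i][j] for i < y, j < x,
  // the sum over rows [y0, y1) and columns [x0, x1) is four table lookups.
  static int BoxSumSketch(const int* cumsum, int stride,
                          int x0, int y0, int x1, int y1) {
    return cumsum[y1 * stride + x1] - cumsum[y0 * stride + x1] -
           cumsum[y1 * stride + x0] + cumsum[y0 * stride + x0];
  }
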
@@ -1711,15 +2178,22 @@
 // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
 // as the buffer is treated as circular.
 LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int32* dst_cumsum, int dst_stride32_cumsum,
-             int width, int height, int radius) {
+int ARGBBlur(const uint8* src_argb,
+             int src_stride_argb,
+             uint8* dst_argb,
+             int dst_stride_argb,
+             int32* dst_cumsum,
+             int dst_stride32_cumsum,
+             int width,
+             int height,
+             int radius) {
   int y;
-  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
   void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
-      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+                                    int width, int area, uint8* dst,
+                                    int count) = CumulativeSumToAverageRow_C;
   int32* cumsum_bot_row;
   int32* max_cumsum_bot_row;
   int32* cumsum_top_row;
@@ -1749,9 +2223,8 @@
 #endif
   // Compute enough CumulativeSum for first row to be blurred. After this
   // one row of CumulativeSum is updated at a time.
-  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
-                           dst_cumsum, dst_stride32_cumsum,
-                           width, radius);
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
+                           dst_stride32_cumsum, width, radius);
 
   src_argb = src_argb + radius * src_stride_argb;
   cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
@@ -1789,24 +2262,24 @@
 
     // Left clipped.
     for (x = 0; x < radius + 1; ++x) {
-      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                                boxwidth, area, &dst_argb[x * 4], 1);
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                                &dst_argb[x * 4], 1);
       area += (bot_y - top_y);
       boxwidth += 4;
     }
 
     // Middle unclipped.
     n = (width - 1) - radius - x + 1;
-    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                              boxwidth, area, &dst_argb[x * 4], n);
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                              &dst_argb[x * 4], n);
 
     // Right clipped.
     for (x += n; x <= width - 1; ++x) {
       area -= (bot_y - top_y);
       boxwidth -= 4;
       CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
-                                cumsum_bot_row + (x - radius - 1) * 4,
-                                boxwidth, area, &dst_argb[x * 4], 1);
+                                cumsum_bot_row + (x - radius - 1) * 4, boxwidth,
+                                area, &dst_argb[x * 4], 1);
     }
     dst_argb += dst_stride_argb;
   }
@@ -1815,12 +2288,16 @@
 
 // Multiply ARGB image by a specified ARGB value.
 LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height, uint32 value) {
+int ARGBShade(const uint8* src_argb,
+              int src_stride_argb,
+              uint8* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height,
+              uint32 value) {
   int y;
-  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
-                       int width, uint32 value) = ARGBShadeRow_C;
+  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) = ARGBShadeRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
     return -1;
   }
@@ -1830,8 +2307,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1846,6 +2322,11 @@
     ARGBShadeRow = ARGBShadeRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBSHADEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
+    ARGBShadeRow = ARGBShadeRow_MSA;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBShadeRow(src_argb, dst_argb, width, value);
@@ -1857,12 +2338,17 @@
 
 // Interpolate 2 planes by specified amount (0 to 255).
 LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
-                     const uint8* src1, int src_stride1,
-                     uint8* dst, int dst_stride,
-                     int width, int height, int interpolation) {
+int InterpolatePlane(const uint8* src0,
+                     int src_stride0,
+                     const uint8* src1,
+                     int src_stride1,
+                     uint8* dst,
+                     int dst_stride,
+                     int width,
+                     int height,
+                     int interpolation) {
   int y;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
   if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
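
For InterpolatePlane, the interpolation argument is a fraction in [0, 255]: 0 selects the first source, 128 is roughly the average, and 255 weights almost entirely toward the second source. A per-byte sketch of the blend; the actual row functions special-case some fractions and their rounding may differ slightly:

  // Per-byte blend sketch; fraction is 0..255 (0 = a, 255 ~= b).
  #include <stdint.h>

  static uint8_t InterpolateByteSketch(uint8_t a, uint8_t b, int fraction) {
    return (uint8_t)((a * (256 - fraction) + b * fraction) >> 8);
  }
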
@@ -1875,9 +2361,7 @@
     dst_stride = -dst_stride;
   }
   // Coalesce rows.
-  if (src_stride0 == width &&
-      src_stride1 == width &&
-      dst_stride == width) {
+  if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
     width *= height;
     height = 1;
     src_stride0 = src_stride1 = dst_stride = 0;
@@ -1907,14 +2391,21 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&
-      IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&
-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&
-      IS_ALIGNED(width, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src0, 4) &&
+      IS_ALIGNED(src_stride0, 4) && IS_ALIGNED(src1, 4) &&
+      IS_ALIGNED(src_stride1, 4) && IS_ALIGNED(dst, 4) &&
+      IS_ALIGNED(dst_stride, 4) && IS_ALIGNED(width, 4)) {
     InterpolateRow = InterpolateRow_DSPR2;
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     InterpolateRow(dst, src0, src1 - src0, width, interpolation);
@@ -1927,61 +2418,71 @@
 
 // Interpolate 2 ARGB images by specified amount (0 to 255).
 LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
-                    const uint8* src_argb1, int src_stride_argb1,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int interpolation) {
-  return InterpolatePlane(src_argb0, src_stride_argb0,
-                          src_argb1, src_stride_argb1,
-                          dst_argb, dst_stride_argb,
+int ARGBInterpolate(const uint8* src_argb0,
+                    int src_stride_argb0,
+                    const uint8* src_argb1,
+                    int src_stride_argb1,
+                    uint8* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int interpolation) {
+  return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1,
+                          src_stride_argb1, dst_argb, dst_stride_argb,
                           width * 4, height, interpolation);
 }
 
 // Interpolate 2 YUV images by specified amount (0 to 255).
 LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
-                    const uint8* src0_u, int src0_stride_u,
-                    const uint8* src0_v, int src0_stride_v,
-                    const uint8* src1_y, int src1_stride_y,
-                    const uint8* src1_u, int src1_stride_u,
-                    const uint8* src1_v, int src1_stride_v,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height, int interpolation) {
+int I420Interpolate(const uint8* src0_y,
+                    int src0_stride_y,
+                    const uint8* src0_u,
+                    int src0_stride_u,
+                    const uint8* src0_v,
+                    int src0_stride_v,
+                    const uint8* src1_y,
+                    int src1_stride_y,
+                    const uint8* src1_u,
+                    int src1_stride_u,
+                    const uint8* src1_v,
+                    int src1_stride_v,
+                    uint8* dst_y,
+                    int dst_stride_y,
+                    uint8* dst_u,
+                    int dst_stride_u,
+                    uint8* dst_v,
+                    int dst_stride_v,
+                    int width,
+                    int height,
+                    int interpolation) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src0_y || !src0_u || !src0_v ||
-      !src1_y || !src1_u || !src1_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
+      !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
-  InterpolatePlane(src0_y, src0_stride_y,
-                   src1_y, src1_stride_y,
-                   dst_y, dst_stride_y,
-                   width, height, interpolation);
-  InterpolatePlane(src0_u, src0_stride_u,
-                   src1_u, src1_stride_u,
-                   dst_u, dst_stride_u,
-                   halfwidth, halfheight, interpolation);
-  InterpolatePlane(src0_v, src0_stride_v,
-                   src1_v, src1_stride_v,
-                   dst_v, dst_stride_v,
-                   halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
+                   dst_stride_y, width, height, interpolation);
+  InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
+                   dst_stride_u, halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v,
+                   dst_stride_v, halfwidth, halfheight, interpolation);
   return 0;
 }
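
Note for reviewers: a minimal usage sketch of the InterpolatePlane() entry point whose signature is reformatted above. The wrapper name and buffers below are hypothetical; per the comment, interpolation runs 0 to 255, where 0 keeps the first source and larger values blend toward the second.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Hypothetical helper: blend two same-sized planes roughly 50/50.
// frame0, frame1 and out are caller-owned buffers of stride * height bytes.
int BlendPlanesHalf(const uint8* frame0, const uint8* frame1, uint8* out,
                    int stride, int width, int height) {
  return libyuv::InterpolatePlane(frame0, stride, frame1, stride, out, stride,
                                  width, height, 128 /* ~even blend */);
}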
 
 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.
 LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
-                uint8* dst_argb, int dst_stride_argb,
-                const uint8* shuffler, int width, int height) {
+int ARGBShuffle(const uint8* src_bgra,
+                int src_stride_bgra,
+                uint8* dst_argb,
+                int dst_stride_argb,
+                const uint8* shuffler,
+                int width,
+                int height) {
   int y;
   void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
                          const uint8* shuffler, int width) = ARGBShuffleRow_C;
-  if (!src_bgra || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_bgra || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1991,8 +2492,7 @@
     src_stride_bgra = -src_stride_bgra;
   }
   // Coalesce rows.
-  if (src_stride_bgra == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_bgra = dst_stride_argb = 0;
@@ -2029,6 +2529,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBSHUFFLEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBShuffleRow = ARGBShuffleRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
@@ -2039,28 +2547,32 @@
 }
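
A hedged usage sketch for ARGBShuffle() as reformatted above: the shuffler argument is a 16 byte table of byte indices applied to each 16 byte (four pixel) group. The table below, which reverses every 4 byte pixel, is illustrative rather than one of the library's named masks.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Reverse the byte order of every 4-byte pixel, e.g. BGRA <-> ARGB layout.
static const uint8 kReversePixelBytes[16] = {3,  2,  1, 0, 7,  6,  5,  4,
                                             11, 10, 9, 8, 15, 14, 13, 12};

int SwapChannelOrder(const uint8* src, int src_stride, uint8* dst,
                     int dst_stride, int width, int height) {
  return libyuv::ARGBShuffle(src, src_stride, dst, dst_stride,
                             kReversePixelBytes, width, height);
}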
 
 // Sobel ARGB effect.
-static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
-                        uint8* dst_argb, int dst_stride_argb,
-                        int width, int height,
+static int ARGBSobelize(const uint8* src_argb,
+                        int src_stride_argb,
+                        uint8* dst_argb,
+                        int dst_stride_argb,
+                        int width,
+                        int height,
                         void (*SobelRow)(const uint8* src_sobelx,
                                          const uint8* src_sobely,
-                                         uint8* dst, int width)) {
+                                         uint8* dst,
+                                         int width)) {
   int y;
   void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
       ARGBToYJRow_C;
-  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) = SobelYRow_C;
+  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely,
+                    int width) = SobelYRow_C;
   void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobely, int width) =
       SobelXRow_C;
   const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
-  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
@@ -2088,6 +2600,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
 
 #if defined(HAS_SOBELYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -2159,9 +2679,12 @@
 
 // Sobel ARGB effect.
 LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
+int ARGBSobel(const uint8* src_argb,
+              int src_stride_argb,
+              uint8* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
   void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
                    uint8* dst_argb, int width) = SobelRow_C;
 #if defined(HAS_SOBELROW_SSE2)
@@ -2180,15 +2703,26 @@
     }
   }
 #endif
+#if defined(HAS_SOBELROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelRow = SobelRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelRow = SobelRow_MSA;
+    }
+  }
+#endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelRow);
 }
 
 // Sobel ARGB effect with planar output.
 LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_y, int dst_stride_y,
-                     int width, int height) {
+int ARGBSobelToPlane(const uint8* src_argb,
+                     int src_stride_argb,
+                     uint8* dst_y,
+                     int dst_stride_y,
+                     int width,
+                     int height) {
   void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
                           uint8* dst_, int width) = SobelToPlaneRow_C;
 #if defined(HAS_SOBELTOPLANEROW_SSE2)
@@ -2207,16 +2741,27 @@
     }
   }
 #endif
-  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
-                      width, height, SobelToPlaneRow);
+#if defined(HAS_SOBELTOPLANEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SobelToPlaneRow = SobelToPlaneRow_MSA;
+    }
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
+                      height, SobelToPlaneRow);
 }
 
 // SobelXY ARGB effect.
 // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
 LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
+int ARGBSobelXY(const uint8* src_argb,
+                int src_stride_argb,
+                uint8* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height) {
   void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) = SobelXYRow_C;
 #if defined(HAS_SOBELXYROW_SSE2)
@@ -2235,32 +2780,41 @@
     }
   }
 #endif
+#if defined(HAS_SOBELXYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelXYRow = SobelXYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelXYRow = SobelXYRow_MSA;
+    }
+  }
+#endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelXYRow);
 }
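
A short usage sketch for the SobelXY variant: per the comment above, the output ARGB frame carries Sobel X in the R channel, Sobel Y in the B channel and the combined Sobel value in G. The wrapper below is hypothetical.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Produce an edge map with per-direction gradients kept in separate channels.
int EdgeMapXY(const uint8* argb, int argb_stride, uint8* edges,
              int edges_stride, int width, int height) {
  return libyuv::ARGBSobelXY(argb, argb_stride, edges, edges_stride, width,
                             height);
}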
 
 // Apply a 4x4 polynomial to each ARGB pixel.
 LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb, int dst_stride_argb,
+int ARGBPolynomial(const uint8* src_argb,
+                   int src_stride_argb,
+                   uint8* dst_argb,
+                   int dst_stride_argb,
                    const float* poly,
-                   int width, int height) {
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBPolynomialRow)(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) = ARGBPolynomialRow_C;
+  void (*ARGBPolynomialRow)(const uint8* src_argb, uint8* dst_argb,
+                            const float* poly, int width) = ARGBPolynomialRow_C;
   if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2285,28 +2839,103 @@
   return 0;
 }
 
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16* src_y,
+                   int src_stride_y,
+                   uint16* dst_y,
+                   int dst_stride_y,
+                   float scale,
+                   int width,
+                   int height) {
+  int y;
+  void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) =
+      HalfFloatRow_C;
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  src_stride_y >>= 1;
+  dst_stride_y >>= 1;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_HALFFLOATROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    HalfFloatRow = HalfFloatRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = HalfFloatRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HalfFloatRow = HalfFloatRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = HalfFloatRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_F16C)
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
+    HalfFloatRow =
+        (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HalfFloatRow =
+        (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    HalfFloatRow(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
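
A hedged usage sketch for the new HalfFloatPlane() above. Two details worth noting for callers: the stride arguments are in bytes even though the data is 16 bit (the function halves them internally), and scale multiplies each source value before the half float store, so a range normalization can happen in the same pass. The wrapper and the 10 bit assumption below are hypothetical.

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Convert packed 10-bit samples (0..1023) to half floats normalized to [0,1].
int TenBitToHalfFloat(const uint16* src, uint16* dst, int width, int height) {
  const int stride_bytes = width * 2;  // packed rows, 2 bytes per sample
  return libyuv::HalfFloatPlane(src, stride_bytes, dst, stride_bytes,
                                1.0f / 1023.0f, width, height);
}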
+
 // Apply a lumacolortable to each ARGB pixel.
 LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_argb, int dst_stride_argb,
+int ARGBLumaColorTable(const uint8* src_argb,
+                       int src_stride_argb,
+                       uint8* dst_argb,
+                       int dst_stride_argb,
                        const uint8* luma,
-                       int width, int height) {
+                       int width,
+                       int height) {
   int y;
-  void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
-      int width, const uint8* luma, const uint32 lumacoeff) =
-      ARGBLumaColorTableRow_C;
+  void (*ARGBLumaColorTableRow)(
+      const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma,
+      const uint32 lumacoeff) = ARGBLumaColorTableRow_C;
   if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2327,9 +2956,12 @@
 
 // Copy Alpha from one ARGB image to another.
 LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
+int ARGBCopyAlpha(const uint8* src_argb,
+                  int src_stride_argb,
+                  uint8* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   int y;
   void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
       ARGBCopyAlphaRow_C;
@@ -2343,8 +2975,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2376,9 +3007,12 @@
 
 // Extract just the alpha channel from ARGB.
 LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
-                     uint8* dst_a, int dst_stride,
-                     int width, int height) {
+int ARGBExtractAlpha(const uint8* src_argb,
+                     int src_stride,
+                     uint8* dst_a,
+                     int dst_stride,
+                     int width,
+                     int height) {
   if (!src_argb || !dst_a || width <= 0 || height == 0) {
     return -1;
   }
@@ -2394,7 +3028,7 @@
     height = 1;
     src_stride = dst_stride = 0;
   }
-  void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) =
+  void (*ARGBExtractAlphaRow)(const uint8* src_argb, uint8* dst_a, int width) =
       ARGBExtractAlphaRow_C;
 #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -2402,6 +3036,12 @@
                                                : ARGBExtractAlphaRow_Any_SSE2;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2
+                                                : ARGBExtractAlphaRow_Any_AVX2;
+  }
+#endif
 #if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
@@ -2419,9 +3059,12 @@
 
 // Copy a planar Y channel to the alpha channel of a destination ARGB image.
 LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
-                     uint8* dst_argb, int dst_stride_argb,
-                     int width, int height) {
+int ARGBCopyYToAlpha(const uint8* src_y,
+                     int src_stride_y,
+                     uint8* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height) {
   int y;
   void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
       ARGBCopyYToAlphaRow_C;
@@ -2435,8 +3078,7 @@
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -2470,20 +3112,22 @@
 // directly. A SplitUVRow_Odd function could copy the remaining chroma.
 
 LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int YUY2ToNV12(const uint8* src_yuy2,
+               int src_stride_yuy2,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
   void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) = SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src_yuy2 ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2540,6 +3184,14 @@
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
   {
     int awidth = halfwidth * 2;
@@ -2568,20 +3220,22 @@
 }
 
 LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int UYVYToNV12(const uint8* src_uyvy,
+               int src_stride_uyvy,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
   void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) = SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src_uyvy ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2638,6 +3292,14 @@
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
   {
     int awidth = halfwidth * 2;
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
index 01ea5c4..277c53b 100644
--- a/files/source/rotate.cc
+++ b/files/source/rotate.cc
@@ -22,12 +22,20 @@
 #endif
 
 LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void TransposePlane(const uint8* src,
+                    int src_stride,
+                    uint8* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   int i = height;
-  void (*TransposeWx8)(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  void (*TransposeWx16)(const uint8* src, int src_stride, uint8* dst,
+                        int dst_stride, int width) = TransposeWx16_C;
+#else
+  void (*TransposeWx8)(const uint8* src, int src_stride, uint8* dst,
+                       int dst_stride, int width) = TransposeWx8_C;
+#endif
 #if defined(HAS_TRANSPOSEWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeWx8 = TransposeWx8_NEON;
@@ -51,22 +59,40 @@
 #endif
 #if defined(HAS_TRANSPOSEWX8_DSPR2)
   if (TestCpuFlag(kCpuHasDSPR2)) {
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+    if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) &&
+        IS_ALIGNED(src_stride, 4)) {
       TransposeWx8 = TransposeWx8_Fast_DSPR2;
     } else {
       TransposeWx8 = TransposeWx8_DSPR2;
     }
   }
 #endif
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeWx16 = TransposeWx16_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      TransposeWx16 = TransposeWx16_MSA;
+    }
+  }
+#endif
 
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  // Work across the source in 16x16 tiles
+  while (i >= 16) {
+    TransposeWx16(src, src_stride, dst, dst_stride, width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst += 16;               // Move over 16 columns.
+    i -= 16;
+  }
+#else
   // Work across the source in 8x8 tiles
   while (i >= 8) {
     TransposeWx8(src, src_stride, dst, dst_stride, width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst += 8;                 // Move over 8 columns.
+    src += 8 * src_stride;  // Go down 8 rows.
+    dst += 8;               // Move over 8 columns.
     i -= 8;
   }
+#endif
 
   if (i > 0) {
     TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
@@ -74,9 +100,12 @@
 }
 
 LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height) {
+void RotatePlane90(const uint8* src,
+                   int src_stride,
+                   uint8* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
   // Rotate by 90 is a transpose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
@@ -86,9 +115,12 @@
 }
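
The comment above describes rotate-by-90 as a bottom-up transpose. A standalone sketch of that idea (not the library body verbatim, which is elided from this hunk) reuses TransposePlane() from earlier in the file:

// Sketch: start at the last source row, negate the stride so rows are read
// bottom to top, then transpose as usual.
static void RotatePlane90Sketch(const uint8* src, int src_stride, uint8* dst,
                                int dst_stride, int width, int height) {
  src += src_stride * (height - 1);
  src_stride = -src_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}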
 
 LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void RotatePlane270(const uint8* src,
+                    int src_stride,
+                    uint8* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   // Rotate by 270 is a transpose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
@@ -98,9 +130,12 @@
 }
 
 LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void RotatePlane180(const uint8* src,
+                    int src_stride,
+                    uint8* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width);
   const uint8* src_bot = src + src_stride * (height - 1);
@@ -135,12 +170,20 @@
 #endif
 // TODO(fbarchard): Mirror on mips handle unaligned memory.
 #if defined(HAS_MIRRORROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst, 4) &&
+      IS_ALIGNED(dst_stride, 4)) {
     MirrorRow = MirrorRow_DSPR2;
   }
 #endif
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -181,15 +224,24 @@
 }
 
 LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void TransposeUV(const uint8* src,
+                 int src_stride,
+                 uint8* dst_a,
+                 int dst_stride_a,
+                 uint8* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   int i = height;
-  void (*TransposeUVWx8)(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  void (*TransposeUVWx16)(const uint8* src, int src_stride, uint8* dst_a,
+                          int dst_stride_a, uint8* dst_b, int dst_stride_b,
+                          int width) = TransposeUVWx16_C;
+#else
+  void (*TransposeUVWx8)(const uint8* src, int src_stride, uint8* dst_a,
+                         int dst_stride_a, uint8* dst_b, int dst_stride_b,
                          int width) = TransposeUVWx8_C;
+#endif
 #if defined(HAS_TRANSPOSEUVWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeUVWx8 = TransposeUVWx8_NEON;
@@ -204,68 +256,92 @@
   }
 #endif
 #if defined(HAS_TRANSPOSEUVWX8_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) &&
+      IS_ALIGNED(src_stride, 4)) {
     TransposeUVWx8 = TransposeUVWx8_DSPR2;
   }
 #endif
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx16 = TransposeUVWx16_MSA;
+    }
+  }
+#endif
 
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  // Work through the source in 16x16 tiles.
+  while (i >= 16) {
+    TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                    width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst_a += 16;             // Move over 16 columns.
+    dst_b += 16;             // Move over 16 columns.
+    i -= 16;
+  }
+#else
   // Work through the source in 8x8 tiles.
   while (i >= 8) {
-    TransposeUVWx8(src, src_stride,
-                   dst_a, dst_stride_a,
-                   dst_b, dst_stride_b,
+    TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                    width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst_a += 8;               // Move over 8 columns.
-    dst_b += 8;               // Move over 8 columns.
+    src += 8 * src_stride;  // Go down 8 rows.
+    dst_a += 8;             // Move over 8 columns.
+    dst_b += 8;             // Move over 8 columns.
     i -= 8;
   }
+#endif
 
   if (i > 0) {
-    TransposeUVWxH_C(src, src_stride,
-                     dst_a, dst_stride_a,
-                     dst_b, dst_stride_b,
+    TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                      width, i);
   }
 }
 
 LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
-                uint8* dst_a, int dst_stride_a,
-                uint8* dst_b, int dst_stride_b,
-                int width, int height) {
+void RotateUV90(const uint8* src,
+                int src_stride,
+                uint8* dst_a,
+                int dst_stride_a,
+                uint8* dst_b,
+                int dst_stride_b,
+                int width,
+                int height) {
   src += src_stride * (height - 1);
   src_stride = -src_stride;
 
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
+  TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+              height);
 }
 
 LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void RotateUV270(const uint8* src,
+                 int src_stride,
+                 uint8* dst_a,
+                 int dst_stride_a,
+                 uint8* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   dst_a += dst_stride_a * (width - 1);
   dst_b += dst_stride_b * (width - 1);
   dst_stride_a = -dst_stride_a;
   dst_stride_b = -dst_stride_b;
 
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
+  TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+              height);
 }
 
 // Rotate 180 is a horizontal and vertical flip.
 LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void RotateUV180(const uint8* src,
+                 int src_stride,
+                 uint8* dst_a,
+                 int dst_stride_a,
+                 uint8* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   int i;
   void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
       MirrorUVRow_C;
@@ -280,8 +356,8 @@
   }
 #endif
 #if defined(HAS_MIRRORUVROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
+      IS_ALIGNED(src_stride, 4)) {
     MirrorUVRow = MirrorUVRow_DSPR2;
   }
 #endif
@@ -298,9 +374,12 @@
 }
 
 LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
-                uint8* dst, int dst_stride,
-                int width, int height,
+int RotatePlane(const uint8* src,
+                int src_stride,
+                uint8* dst,
+                int dst_stride,
+                int width,
+                int height,
                 enum RotationMode mode) {
   if (!src || width <= 0 || height == 0 || !dst) {
     return -1;
@@ -316,24 +395,16 @@
   switch (mode) {
     case kRotate0:
       // copy frame
-      CopyPlane(src, src_stride,
-                dst, dst_stride,
-                width, height);
+      CopyPlane(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate90:
-      RotatePlane90(src, src_stride,
-                    dst, dst_stride,
-                    width, height);
+      RotatePlane90(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate270:
-      RotatePlane270(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
+      RotatePlane270(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate180:
-      RotatePlane180(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
+      RotatePlane180(src, src_stride, dst, dst_stride, width, height);
       return 0;
     default:
       break;
@@ -342,18 +413,25 @@
 }
 
 LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height,
+int I420Rotate(const uint8* src_y,
+               int src_stride_y,
+               const uint8* src_u,
+               int src_stride_u,
+               const uint8* src_v,
+               int src_stride_v,
+               uint8* dst_y,
+               int dst_stride_y,
+               uint8* dst_u,
+               int dst_stride_u,
+               uint8* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
                enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
     return -1;
   }
 
@@ -372,45 +450,29 @@
   switch (mode) {
     case kRotate0:
       // copy frame
-      return I420Copy(src_y, src_stride_y,
-                      src_u, src_stride_u,
-                      src_v, src_stride_v,
-                      dst_y, dst_stride_y,
-                      dst_u, dst_stride_u,
-                      dst_v, dst_stride_v,
-                      width, height);
+      return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                      src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                      dst_v, dst_stride_v, width, height);
     case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotatePlane90(src_u, src_stride_u,
-                    dst_u, dst_stride_u,
-                    halfwidth, halfheight);
-      RotatePlane90(src_v, src_stride_v,
-                    dst_v, dst_stride_v,
-                    halfwidth, halfheight);
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                    halfheight);
+      RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                    halfheight);
       return 0;
     case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane270(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane270(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
       return 0;
     case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane180(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane180(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
       return 0;
     default:
       break;
@@ -419,17 +481,23 @@
 }
 
 LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-                     const uint8* src_uv, int src_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int width, int height,
+int NV12ToI420Rotate(const uint8* src_y,
+                     int src_stride_y,
+                     const uint8* src_uv,
+                     int src_stride_uv,
+                     uint8* dst_y,
+                     int dst_stride_y,
+                     uint8* dst_u,
+                     int dst_stride_u,
+                     uint8* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height,
                      enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_uv || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+      !dst_v) {
     return -1;
   }
 
@@ -446,38 +514,23 @@
   switch (mode) {
     case kRotate0:
       // copy frame
-      return NV12ToI420(src_y, src_stride_y,
-                        src_uv, src_stride_uv,
-                        dst_y, dst_stride_y,
-                        dst_u, dst_stride_u,
-                        dst_v, dst_stride_v,
+      return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+                        dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
                         width, height);
     case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotateUV90(src_uv, src_stride_uv,
-                 dst_u, dst_stride_u,
-                 dst_v, dst_stride_v,
-                 halfwidth, halfheight);
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                 dst_stride_v, halfwidth, halfheight);
       return 0;
     case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV270(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                  dst_stride_v, halfwidth, halfheight);
       return 0;
     case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV180(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                  dst_stride_v, halfwidth, halfheight);
       return 0;
     default:
       break;
diff --git a/files/source/rotate_any.cc b/files/source/rotate_any.cc
index 31a74c3..562096b 100644
--- a/files/source/rotate_any.cc
+++ b/files/source/rotate_any.cc
@@ -18,16 +18,16 @@
 extern "C" {
 #endif
 
-#define TANY(NAMEANY, TPOS_SIMD, MASK)                                         \
-    void NAMEANY(const uint8* src, int src_stride,                             \
-                 uint8* dst, int dst_stride, int width) {                      \
-      int r = width & MASK;                                                    \
-      int n = width - r;                                                       \
-      if (n > 0) {                                                             \
-        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \
-      }                                                                        \
-      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
-    }
+#define TANY(NAMEANY, TPOS_SIMD, MASK)                                        \
+  void NAMEANY(const uint8* src, int src_stride, uint8* dst, int dst_stride,  \
+               int width) {                                                   \
+    int r = width & MASK;                                                     \
+    int n = width - r;                                                        \
+    if (n > 0) {                                                              \
+      TPOS_SIMD(src, src_stride, dst, dst_stride, n);                         \
+    }                                                                         \
+    TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
+  }
 
 #ifdef HAS_TRANSPOSEWX8_NEON
 TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
@@ -41,22 +41,22 @@
 #ifdef HAS_TRANSPOSEWX8_DSPR2
 TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
 #endif
+#ifdef HAS_TRANSPOSEWX16_MSA
+TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
+#endif
 #undef TANY
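
For readers new to the Any-wrapper pattern, a hand-written expansion of TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) is shown below: the SIMD kernel handles the width rounded down to a multiple of eight, and TransposeWx8_C finishes the remainder columns. TUVANY below follows the same shape with two destination planes.

void TransposeWx8_Any_NEON(const uint8* src, int src_stride, uint8* dst,
                           int dst_stride, int width) {
  int r = width & 7;  // leftover columns the NEON kernel does not cover
  int n = width - r;  // widest multiple of 8 the kernel can take
  if (n > 0) {
    TransposeWx8_NEON(src, src_stride, dst, dst_stride, n);
  }
  TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
}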
 
 #define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
-    void NAMEANY(const uint8* src, int src_stride,                             \
-                uint8* dst_a, int dst_stride_a,                                \
-                uint8* dst_b, int dst_stride_b, int width) {                   \
-      int r = width & MASK;                                                    \
-      int n = width - r;                                                       \
-      if (n > 0) {                                                             \
-        TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,   \
-                  n);                                                          \
-      }                                                                        \
-      TransposeUVWx8_C(src + n * 2, src_stride,                                \
-                       dst_a + n * dst_stride_a, dst_stride_a,                 \
-                       dst_b + n * dst_stride_b, dst_stride_b, r);             \
-    }
+  void NAMEANY(const uint8* src, int src_stride, uint8* dst_a,                 \
+               int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) {  \
+    int r = width & MASK;                                                      \
+    int n = width - r;                                                         \
+    if (n > 0) {                                                               \
+      TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
+    }                                                                          \
+    TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a,        \
+                     dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
+  }
 
 #ifdef HAS_TRANSPOSEUVWX8_NEON
 TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
@@ -67,14 +67,12 @@
 #ifdef HAS_TRANSPOSEUVWX8_DSPR2
 TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
 #endif
+#ifdef HAS_TRANSPOSEUVWX16_MSA
+TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
+#endif
 #undef TUVANY
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
-
-
-
-
diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc
index 787c0ad..b458d8f 100644
--- a/files/source/rotate_argb.cc
+++ b/files/source/rotate_argb.cc
@@ -22,29 +22,44 @@
 
 // ARGBScale has a function to copy pixels to a row, striding each source
 // pixel by a constant.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || \
-    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) &&                          \
+    (defined(_M_IX86) ||                                     \
+     (defined(__x86_64__) && !defined(__native_client__)) || \
+     defined(__i386__))
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr,
+                               int src_stride,
+                               int src_stepx,
+                               uint8* dst_ptr,
+                               int dst_width);
 #endif
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
-void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr,
+                               int src_stride,
+                               int src_stepx,
+                               uint8* dst_ptr,
+                               int dst_width);
 #endif
 
-void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
-                            int src_stepx, uint8* dst_ptr, int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_ptr,
+                            int,
+                            int src_stepx,
+                            uint8* dst_ptr,
+                            int dst_width);
 
-static void ARGBTranspose(const uint8* src, int src_stride,
-                          uint8* dst, int dst_stride, int width, int height) {
+static void ARGBTranspose(const uint8* src,
+                          int src_stride,
+                          uint8* dst,
+                          int dst_stride,
+                          int width,
+                          int height) {
   int i;
   int src_pixel_step = src_stride >> 2;
   void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
-      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+                               int src_step, uint8* dst_ptr, int dst_width) =
+      ScaleARGBRowDownEven_C;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
     ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
@@ -63,8 +78,12 @@
   }
 }
 
-void ARGBRotate90(const uint8* src, int src_stride,
-                  uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate90(const uint8* src,
+                  int src_stride,
+                  uint8* dst,
+                  int dst_stride,
+                  int width,
+                  int height) {
   // Rotate by 90 is an ARGBTranspose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
@@ -73,8 +92,12 @@
   ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
 }
 
-void ARGBRotate270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate270(const uint8* src,
+                   int src_stride,
+                   uint8* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
   // Rotate by 270 is an ARGBTranspose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
@@ -83,8 +106,12 @@
   ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
 }
 
-void ARGBRotate180(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate180(const uint8* src,
+                   int src_stride,
+                   uint8* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width * 4);
   const uint8* src_bot = src + src_stride * (height - 1);
@@ -118,6 +145,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -146,9 +181,9 @@
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
-    ARGBMirrorRow(src, row, width);  // Mirror first row into a buffer
+    ARGBMirrorRow(src, row, width);      // Mirror first row into a buffer
     ARGBMirrorRow(src_bot, dst, width);  // Mirror last row into first row
-    CopyRow(row, dst_bot, width * 4);  // Copy first mirrored row into last
+    CopyRow(row, dst_bot, width * 4);    // Copy first mirrored row into last
     src += src_stride;
     dst += dst_stride;
     src_bot -= src_stride;
@@ -158,8 +193,12 @@
 }
 
 LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb, int width, int height,
+int ARGBRotate(const uint8* src_argb,
+               int src_stride_argb,
+               uint8* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height,
                enum RotationMode mode) {
   if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
     return -1;
@@ -175,23 +214,19 @@
   switch (mode) {
     case kRotate0:
       // copy frame
-      return ARGBCopy(src_argb, src_stride_argb,
-                      dst_argb, dst_stride_argb,
+      return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height);
     case kRotate90:
-      ARGBRotate90(src_argb, src_stride_argb,
-                   dst_argb, dst_stride_argb,
-                   width, height);
+      ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                   height);
       return 0;
     case kRotate270:
-      ARGBRotate270(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
+      ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                    height);
       return 0;
     case kRotate180:
-      ARGBRotate180(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
+      ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                    height);
       return 0;
     default:
       break;
diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc
index b33a9a0..cdd231f 100644
--- a/files/source/rotate_common.cc
+++ b/files/source/rotate_common.cc
@@ -16,8 +16,11 @@
 extern "C" {
 #endif
 
-void TransposeWx8_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width) {
+void TransposeWx8_C(const uint8* src,
+                    int src_stride,
+                    uint8* dst,
+                    int dst_stride,
+                    int width) {
   int i;
   for (i = 0; i < width; ++i) {
     dst[0] = src[0 * src_stride];
@@ -33,9 +36,13 @@
   }
 }
 
-void TransposeUVWx8_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b, int width) {
+void TransposeUVWx8_C(const uint8* src,
+                      int src_stride,
+                      uint8* dst_a,
+                      int dst_stride_a,
+                      uint8* dst_b,
+                      int dst_stride_b,
+                      int width) {
   int i;
   for (i = 0; i < width; ++i) {
     dst_a[0] = src[0 * src_stride + 0];
@@ -60,9 +67,12 @@
   }
 }
 
-void TransposeWxH_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void TransposeWxH_C(const uint8* src,
+                    int src_stride,
+                    uint8* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   int i;
   for (i = 0; i < width; ++i) {
     int j;
@@ -72,10 +82,14 @@
   }
 }
 
-void TransposeUVWxH_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b,
-                      int width, int height) {
+void TransposeUVWxH_C(const uint8* src,
+                      int src_stride,
+                      uint8* dst_a,
+                      int dst_stride_a,
+                      uint8* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height) {
   int i;
   for (i = 0; i < width * 2; i += 2) {
     int j;
diff --git a/files/source/rotate_mips.cc b/files/source/rotate_dspr2.cc
similarity index 65%
rename from files/source/rotate_mips.cc
rename to files/source/rotate_dspr2.cc
index 1e8ce25..2dce910 100644
--- a/files/source/rotate_mips.cc
+++ b/files/source/rotate_dspr2.cc
@@ -18,18 +18,20 @@
 extern "C" {
 #endif
 
-#if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
+    (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
 
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
-   __asm__ __volatile__ (
+void TransposeWx8_DSPR2(const uint8* src,
+                        int src_stride,
+                        uint8* dst,
+                        int dst_stride,
+                        int width) {
+  __asm__ __volatile__(
       ".set push                                         \n"
       ".set noreorder                                    \n"
-      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
-      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
-      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
+      "sll              $t2, %[src_stride], 0x1          \n"  // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
       "addu             $t3, $t2, %[src_stride]          \n"
       "addu             $t5, $t4, %[src_stride]          \n"
       "addu             $t6, $t2, $t4                    \n"
@@ -38,8 +40,8 @@
       "or               $t0, $t0, $t1                    \n"
       "bnez             $t0, 11f                         \n"
       " subu            $t7, $t9, %[src_stride]          \n"
-//dst + dst_stride word aligned
-    "1:                                                  \n"
+      // dst + dst_stride word aligned
+      "1:                                                \n"
       "lbu              $t0, 0(%[src])                   \n"
       "lbux             $t1, %[src_stride](%[src])       \n"
       "lbux             $t8, $t2(%[src])                 \n"
@@ -65,8 +67,8 @@
       "bnez             %[width], 1b                     \n"
       " addu            %[dst], %[dst], %[dst_stride]    \n"
       "b                2f                               \n"
-//dst + dst_stride unaligned
-   "11:                                                  \n"
+      // dst + dst_stride unaligned
+      "11:                                               \n"
       "lbu              $t0, 0(%[src])                   \n"
       "lbux             $t1, %[src_stride](%[src])       \n"
       "lbux             $t8, $t2(%[src])                 \n"
@@ -92,23 +94,20 @@
       "swr              $s1, 4(%[dst])                   \n"
       "swl              $s1, 7(%[dst])                   \n"
       "bnez             %[width], 11b                    \n"
-       "addu             %[dst], %[dst], %[dst_stride]   \n"
-    "2:                                                  \n"
+      "addu             %[dst], %[dst], %[dst_stride]    \n"
+      "2:                                                \n"
       ".set pop                                          \n"
-      :[src] "+r" (src),
-       [dst] "+r" (dst),
-       [width] "+r" (width)
-      :[src_stride] "r" (src_stride),
-       [dst_stride] "r" (dst_stride)
-      : "t0", "t1",  "t2", "t3", "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1"
-  );
+      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
+      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
 }
 
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width) {
-  __asm__ __volatile__ (
+void TransposeWx8_Fast_DSPR2(const uint8* src,
+                             int src_stride,
+                             uint8* dst,
+                             int dst_stride,
+                             int width) {
+  __asm__ __volatile__(
       ".set noat                                         \n"
       ".set push                                         \n"
       ".set noreorder                                    \n"
@@ -126,67 +125,67 @@
       "or               $t0, $t0, $t1                    \n"
       "bnez             $t0, 11f                         \n"
       " subu            $t7, $t9, %[src_stride]          \n"
-//dst + dst_stride word aligned
+      // dst + dst_stride word aligned
       "1:                                                \n"
       "lw               $t0, 0(%[src])                   \n"
       "lwx              $t1, %[src_stride](%[src])       \n"
       "lwx              $t8, $t2(%[src])                 \n"
       "lwx              $t9, $t3(%[src])                 \n"
 
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
+      // t0 = | 30 | 20 | 10 | 00 |
+      // t1 = | 31 | 21 | 11 | 01 |
+      // t8 = | 32 | 22 | 12 | 02 |
+      // t9 = | 33 | 23 | 13 | 03 |
 
       "precr.qb.ph     $s0, $t1, $t0                     \n"
       "precr.qb.ph     $s1, $t9, $t8                     \n"
       "precrq.qb.ph    $s2, $t1, $t0                     \n"
       "precrq.qb.ph    $s3, $t9, $t8                     \n"
 
-  // s0 = | 21 | 01 | 20 | 00 |
-  // s1 = | 23 | 03 | 22 | 02 |
-  // s2 = | 31 | 11 | 30 | 10 |
-  // s3 = | 33 | 13 | 32 | 12 |
+      // s0 = | 21 | 01 | 20 | 00 |
+      // s1 = | 23 | 03 | 22 | 02 |
+      // s2 = | 31 | 11 | 30 | 10 |
+      // s3 = | 33 | 13 | 32 | 12 |
 
       "precr.qb.ph     $s4, $s1, $s0                     \n"
       "precrq.qb.ph    $s5, $s1, $s0                     \n"
       "precr.qb.ph     $s6, $s3, $s2                     \n"
       "precrq.qb.ph    $s7, $s3, $s2                     \n"
 
-  // s4 = | 03 | 02 | 01 | 00 |
-  // s5 = | 23 | 22 | 21 | 20 |
-  // s6 = | 13 | 12 | 11 | 10 |
-  // s7 = | 33 | 32 | 31 | 30 |
+      // s4 = | 03 | 02 | 01 | 00 |
+      // s5 = | 23 | 22 | 21 | 20 |
+      // s6 = | 13 | 12 | 11 | 10 |
+      // s7 = | 33 | 32 | 31 | 30 |
 
       "lwx              $t0, $t4(%[src])                 \n"
       "lwx              $t1, $t5(%[src])                 \n"
       "lwx              $t8, $t6(%[src])                 \n"
       "lwx              $t9, $t7(%[src])                 \n"
 
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
+      // t0 = | 34 | 24 | 14 | 04 |
+      // t1 = | 35 | 25 | 15 | 05 |
+      // t8 = | 36 | 26 | 16 | 06 |
+      // t9 = | 37 | 27 | 17 | 07 |
 
       "precr.qb.ph     $s0, $t1, $t0                     \n"
       "precr.qb.ph     $s1, $t9, $t8                     \n"
       "precrq.qb.ph    $s2, $t1, $t0                     \n"
       "precrq.qb.ph    $s3, $t9, $t8                     \n"
 
-  // s0 = | 25 | 05 | 24 | 04 |
-  // s1 = | 27 | 07 | 26 | 06 |
-  // s2 = | 35 | 15 | 34 | 14 |
-  // s3 = | 37 | 17 | 36 | 16 |
+      // s0 = | 25 | 05 | 24 | 04 |
+      // s1 = | 27 | 07 | 26 | 06 |
+      // s2 = | 35 | 15 | 34 | 14 |
+      // s3 = | 37 | 17 | 36 | 16 |
 
       "precr.qb.ph     $t0, $s1, $s0                     \n"
       "precrq.qb.ph    $t1, $s1, $s0                     \n"
       "precr.qb.ph     $t8, $s3, $s2                     \n"
       "precrq.qb.ph    $t9, $s3, $s2                     \n"
 
-  // t0 = | 07 | 06 | 05 | 04 |
-  // t1 = | 27 | 26 | 25 | 24 |
-  // t8 = | 17 | 16 | 15 | 14 |
-  // t9 = | 37 | 36 | 35 | 34 |
+      // t0 = | 07 | 06 | 05 | 04 |
+      // t1 = | 27 | 26 | 25 | 24 |
+      // t8 = | 17 | 16 | 15 | 14 |
+      // t9 = | 37 | 36 | 35 | 34 |
 
       "addu            $s0, %[dst], %[dst_stride]        \n"
       "addu            $s1, $s0, %[dst_stride]           \n"
@@ -207,67 +206,67 @@
       "bnez             $AT, 1b                          \n"
       " addu            %[dst], $s2, %[dst_stride]       \n"
       "b                2f                               \n"
-//dst + dst_stride unaligned
+      // dst + dst_stride unaligned
       "11:                                               \n"
       "lw               $t0, 0(%[src])                   \n"
       "lwx              $t1, %[src_stride](%[src])       \n"
       "lwx              $t8, $t2(%[src])                 \n"
       "lwx              $t9, $t3(%[src])                 \n"
 
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
+      // t0 = | 30 | 20 | 10 | 00 |
+      // t1 = | 31 | 21 | 11 | 01 |
+      // t8 = | 32 | 22 | 12 | 02 |
+      // t9 = | 33 | 23 | 13 | 03 |
 
       "precr.qb.ph     $s0, $t1, $t0                     \n"
       "precr.qb.ph     $s1, $t9, $t8                     \n"
       "precrq.qb.ph    $s2, $t1, $t0                     \n"
       "precrq.qb.ph    $s3, $t9, $t8                     \n"
 
-  // s0 = | 21 | 01 | 20 | 00 |
-  // s1 = | 23 | 03 | 22 | 02 |
-  // s2 = | 31 | 11 | 30 | 10 |
-  // s3 = | 33 | 13 | 32 | 12 |
+      // s0 = | 21 | 01 | 20 | 00 |
+      // s1 = | 23 | 03 | 22 | 02 |
+      // s2 = | 31 | 11 | 30 | 10 |
+      // s3 = | 33 | 13 | 32 | 12 |
 
       "precr.qb.ph     $s4, $s1, $s0                     \n"
       "precrq.qb.ph    $s5, $s1, $s0                     \n"
       "precr.qb.ph     $s6, $s3, $s2                     \n"
       "precrq.qb.ph    $s7, $s3, $s2                     \n"
 
-  // s4 = | 03 | 02 | 01 | 00 |
-  // s5 = | 23 | 22 | 21 | 20 |
-  // s6 = | 13 | 12 | 11 | 10 |
-  // s7 = | 33 | 32 | 31 | 30 |
+      // s4 = | 03 | 02 | 01 | 00 |
+      // s5 = | 23 | 22 | 21 | 20 |
+      // s6 = | 13 | 12 | 11 | 10 |
+      // s7 = | 33 | 32 | 31 | 30 |
 
       "lwx              $t0, $t4(%[src])                 \n"
       "lwx              $t1, $t5(%[src])                 \n"
       "lwx              $t8, $t6(%[src])                 \n"
       "lwx              $t9, $t7(%[src])                 \n"
 
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
+      // t0 = | 34 | 24 | 14 | 04 |
+      // t1 = | 35 | 25 | 15 | 05 |
+      // t8 = | 36 | 26 | 16 | 06 |
+      // t9 = | 37 | 27 | 17 | 07 |
 
       "precr.qb.ph     $s0, $t1, $t0                     \n"
       "precr.qb.ph     $s1, $t9, $t8                     \n"
       "precrq.qb.ph    $s2, $t1, $t0                     \n"
       "precrq.qb.ph    $s3, $t9, $t8                     \n"
 
-  // s0 = | 25 | 05 | 24 | 04 |
-  // s1 = | 27 | 07 | 26 | 06 |
-  // s2 = | 35 | 15 | 34 | 14 |
-  // s3 = | 37 | 17 | 36 | 16 |
+      // s0 = | 25 | 05 | 24 | 04 |
+      // s1 = | 27 | 07 | 26 | 06 |
+      // s2 = | 35 | 15 | 34 | 14 |
+      // s3 = | 37 | 17 | 36 | 16 |
 
       "precr.qb.ph     $t0, $s1, $s0                     \n"
       "precrq.qb.ph    $t1, $s1, $s0                     \n"
       "precr.qb.ph     $t8, $s3, $s2                     \n"
       "precrq.qb.ph    $t9, $s3, $s2                     \n"
 
-  // t0 = | 07 | 06 | 05 | 04 |
-  // t1 = | 27 | 26 | 25 | 24 |
-  // t8 = | 17 | 16 | 15 | 14 |
-  // t9 = | 37 | 36 | 35 | 34 |
+      // t0 = | 07 | 06 | 05 | 04 |
+      // t1 = | 27 | 26 | 25 | 24 |
+      // t8 = | 17 | 16 | 15 | 14 |
+      // t9 = | 37 | 36 | 35 | 34 |
 
       "addu            $s0, %[dst], %[dst_stride]        \n"
       "addu            $s1, $s0, %[dst_stride]           \n"
@@ -298,34 +297,33 @@
       "2:                                                \n"
       ".set pop                                          \n"
       ".set at                                           \n"
-      :[src] "+r" (src),
-       [dst] "+r" (dst),
-       [width] "+r" (width)
-      :[src_stride] "r" (src_stride),
-       [dst_stride] "r" (dst_stride)
-      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
-  );
+      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
+      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
+        "s2", "s3", "s4", "s5", "s6", "s7");
 }
 
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
-                          uint8* dst_a, int dst_stride_a,
-                          uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_DSPR2(const uint8* src,
+                          int src_stride,
+                          uint8* dst_a,
+                          int dst_stride_a,
+                          uint8* dst_b,
+                          int dst_stride_b,
                           int width) {
-  __asm__ __volatile__ (
+  __asm__ __volatile__(
       ".set push                                         \n"
       ".set noreorder                                    \n"
       "beqz            %[width], 2f                      \n"
-      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
-      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
-      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
+      " sll            $t2, %[src_stride], 0x1           \n"  // src_stride x 2
+      "sll             $t4, %[src_stride], 0x2           \n"  // src_stride x 4
+      "sll             $t9, %[src_stride], 0x3           \n"  // src_stride x 8
       "addu            $t3, $t2, %[src_stride]           \n"
       "addu            $t5, $t4, %[src_stride]           \n"
       "addu            $t6, $t2, $t4                     \n"
       "subu            $t7, $t9, %[src_stride]           \n"
       "srl             $t1, %[width], 1                  \n"
 
-// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
+      // check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
       "andi            $t0, %[dst_a], 0x3                \n"
       "andi            $t8, %[dst_b], 0x3                \n"
       "or              $t0, $t0, $t8                     \n"
@@ -335,52 +333,52 @@
       "or              $t0, $t0, $t8                     \n"
       "bnez            $t0, 11f                          \n"
       " nop                                              \n"
-// dst + dst_stride word aligned (both, a & b dst addresses)
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      // dst + dst_stride word aligned (both, a & b dst addresses)
+      "1:                                                \n"
+      "lw              $t0, 0(%[src])                    \n"  // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n"  // |B1|A1|b1|a1|
       "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "lwx             $t9, $t2(%[src])                  \n"  // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n"  // |B3|A3|b3|a3|
       "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
 
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B3|B2|B1|B0|
 
       "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "packrl.ph       $s1, $t8, $t0                     \n"  // |b1|a1|b0|a0|
       "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+      "packrl.ph       $s2, $s0, $t9                     \n"  // |b3|a3|b2|a2|
 
       "sw              $s3, 0($s5)                       \n"
       "sw              $s4, 0($s6)                       \n"
 
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b3|b2|b1|b0|
 
-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "lwx             $t0, $t4(%[src])                  \n"  // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n"  // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n"  // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n"  // |B7|A7|b7|a7|
       "sw              $s3, 0(%[dst_a])                  \n"
       "sw              $s4, 0(%[dst_b])                  \n"
 
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B6|A6|B7|A7|
+      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B7|B6|B5|B4|
 
       "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "packrl.ph       $s1, $t8, $t0                     \n"  // |b5|a5|b4|a4|
       "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+      "packrl.ph       $s2, $s0, $t9                     \n"  // |b7|a7|b6|a6|
       "sw              $s3, 4($s5)                       \n"
       "sw              $s4, 4($s6)                       \n"
 
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b7|b6|b5|b4|
 
       "addiu           %[src], 4                         \n"
       "addiu           $t1, -1                           \n"
@@ -394,59 +392,59 @@
       "b               2f                                \n"
       " nop                                              \n"
 
-// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
-   "11:                                                  \n"
-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+      "11:                                               \n"
+      "lw              $t0, 0(%[src])                    \n"  // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n"  // |B1|A1|b1|a1|
       "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "lwx             $t9, $t2(%[src])                  \n"  // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n"  // |B3|A3|b3|a3|
       "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
 
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B3|B2|B1|B0|
 
       "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "packrl.ph       $s1, $t8, $t0                     \n"  // |b1|a1|b0|a0|
       "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+      "packrl.ph       $s2, $s0, $t9                     \n"  // |b3|a3|b2|a2|
 
       "swr             $s3, 0($s5)                       \n"
       "swl             $s3, 3($s5)                       \n"
       "swr             $s4, 0($s6)                       \n"
       "swl             $s4, 3($s6)                       \n"
 
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b3|b2|b1|b0|
 
-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "lwx             $t0, $t4(%[src])                  \n"  // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n"  // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n"  // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n"  // |B7|A7|b7|a7|
       "swr             $s3, 0(%[dst_a])                  \n"
       "swl             $s3, 3(%[dst_a])                  \n"
       "swr             $s4, 0(%[dst_b])                  \n"
       "swl             $s4, 3(%[dst_b])                  \n"
 
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B6|A6|B7|A7|
+      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B7|B6|B5|B4|
 
       "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "packrl.ph       $s1, $t8, $t0                     \n"  // |b5|a5|b4|a4|
       "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+      "packrl.ph       $s2, $s0, $t9                     \n"  // |b7|a7|b6|a6|
 
       "swr             $s3, 4($s5)                       \n"
       "swl             $s3, 7($s5)                       \n"
       "swr             $s4, 4($s6)                       \n"
       "swl             $s4, 7($s6)                       \n"
 
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b7|b6|b5|b4|
 
       "addiu           %[src], 4                         \n"
       "addiu           $t1, -1                           \n"
@@ -462,18 +460,11 @@
 
       "2:                                                \n"
       ".set pop                                          \n"
-      : [src] "+r" (src),
-        [dst_a] "+r" (dst_a),
-        [dst_b] "+r" (dst_b),
-        [width] "+r" (width),
-        [src_stride] "+r" (src_stride)
-      : [dst_stride_a] "r" (dst_stride_a),
-        [dst_stride_b] "r" (dst_stride_b)
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3",
-        "s4", "s5", "s6"
-  );
+      : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
+        [width] "+r"(width), [src_stride] "+r"(src_stride)
+      : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
+        "s2", "s3", "s4", "s5", "s6");
 }
 
 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
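
Both DSPR2 kernels select between the word-aligned store path at label "1:" (sw) and the unaligned path at label "11:" (swr/swl) using an andi/or/bnez sequence over the destination pointers and strides. In C terms the test amounts to the sketch below (illustrative only, not part of the patch; the single-plane kernels check just dst and dst_stride the same way, and the helper name is hypothetical):

#include <stdint.h>

// Mirrors the alignment check in TransposeUVWx8_DSPR2: any set bit in the
// low two bits of either destination pointer or stride forces the unaligned
// swr/swl path.
static int UVDstWordAligned(const uint8_t* dst_a, int dst_stride_a,
                            const uint8_t* dst_b, int dst_stride_b) {
  uint32_t bits = (uint32_t)(uintptr_t)dst_a | (uint32_t)(uintptr_t)dst_b |
                  (uint32_t)dst_stride_a | (uint32_t)dst_stride_b;
  return (bits & 0x3) == 0;
}
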
diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc
index cbe870c..85b41dd 100644
--- a/files/source/rotate_gcc.cc
+++ b/files/source/rotate_gcc.cc
@@ -22,342 +22,348 @@
 
 // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
 #if defined(HAS_TRANSPOSEWX8_SSSE3)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movq       (%0),%%xmm0                      \n"
-    "movq       (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "movq       (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "movq       (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movq       (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "movq       (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movq       (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "lea        0x8(%0,%3,8),%0                  \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "sub        $0x8,%2                          \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void TransposeWx8_SSSE3(const uint8* src,
+                        int src_stride,
+                        uint8* dst,
+                        int dst_stride,
+                        int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                          \n"
+      "movq       (%0),%%xmm0                      \n"
+      "movq       (%0,%3),%%xmm1                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm1,%%xmm0                    \n"
+      "movq       (%0),%%xmm2                      \n"
+      "movdqa     %%xmm0,%%xmm1                    \n"
+      "palignr    $0x8,%%xmm1,%%xmm1               \n"
+      "movq       (%0,%3),%%xmm3                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm3,%%xmm2                    \n"
+      "movdqa     %%xmm2,%%xmm3                    \n"
+      "movq       (%0),%%xmm4                      \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "movq       (%0,%3),%%xmm5                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm5,%%xmm4                    \n"
+      "movdqa     %%xmm4,%%xmm5                    \n"
+      "movq       (%0),%%xmm6                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "movq       (%0,%3),%%xmm7                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm7,%%xmm6                    \n"
+      "neg        %3                               \n"
+      "movdqa     %%xmm6,%%xmm7                    \n"
+      "lea        0x8(%0,%3,8),%0                  \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "neg        %3                               \n"
+      // Second round of bit swap.
+      "punpcklwd  %%xmm2,%%xmm0                    \n"
+      "punpcklwd  %%xmm3,%%xmm1                    \n"
+      "movdqa     %%xmm0,%%xmm2                    \n"
+      "movdqa     %%xmm1,%%xmm3                    \n"
+      "palignr    $0x8,%%xmm2,%%xmm2               \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "punpcklwd  %%xmm6,%%xmm4                    \n"
+      "punpcklwd  %%xmm7,%%xmm5                    \n"
+      "movdqa     %%xmm4,%%xmm6                    \n"
+      "movdqa     %%xmm5,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "punpckldq  %%xmm4,%%xmm0                    \n"
+      "movq       %%xmm0,(%1)                      \n"
+      "movdqa     %%xmm0,%%xmm4                    \n"
+      "palignr    $0x8,%%xmm4,%%xmm4               \n"
+      "movq       %%xmm4,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm6,%%xmm2                    \n"
+      "movdqa     %%xmm2,%%xmm6                    \n"
+      "movq       %%xmm2,(%1)                      \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "punpckldq  %%xmm5,%%xmm1                    \n"
+      "movq       %%xmm6,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "movdqa     %%xmm1,%%xmm5                    \n"
+      "movq       %%xmm1,(%1)                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "movq       %%xmm5,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm7,%%xmm3                    \n"
+      "movq       %%xmm3,(%1)                      \n"
+      "movdqa     %%xmm3,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "sub        $0x8,%2                          \n"
+      "movq       %%xmm7,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "jg         1b                               \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
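
One non-obvious step in the kernel above is the neg %3 / lea 0x8(%0,%3,8),%0 pair: the four lea (%0,%3,2),%0 steps walk the source pointer eight rows down, so the stride is negated and a single lea hops back to the starting row while advancing eight columns. Spelled out as plain pointer arithmetic (a sketch with a hypothetical name, not code from the patch):

#include <stdint.h>

// Per-iteration source-pointer update of TransposeWx8_SSSE3.
static const uint8_t* NextSourceBlock(const uint8_t* src, int src_stride) {
  src += 8 * (intptr_t)src_stride;      // after the four lea (%0,%3,2) steps
  src += 8 - 8 * (intptr_t)src_stride;  // neg %3 + lea 0x8(%0,%3,8),%0
  return src;                           // == original src advanced 8 columns
}
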
 
 // Transpose 16x8. 64 bit
 #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     (%0),%%xmm0                      \n"
-    "movdqu     (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "punpckhbw  %%xmm1,%%xmm8                    \n"
-    "movdqu     (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "movdqa     %%xmm8,%%xmm9                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "palignr    $0x8,%%xmm9,%%xmm9               \n"
-    "movdqu     (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm2,%%xmm10                   \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "punpckhbw  %%xmm3,%%xmm10                   \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movdqa     %%xmm10,%%xmm11                  \n"
-    "movdqu     (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "palignr    $0x8,%%xmm11,%%xmm11             \n"
-    "movdqu     (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm4,%%xmm12                   \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "punpckhbw  %%xmm5,%%xmm12                   \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movdqa     %%xmm12,%%xmm13                  \n"
-    "movdqu     (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "palignr    $0x8,%%xmm13,%%xmm13             \n"
-    "movdqu     (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm6,%%xmm14                   \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "punpckhbw  %%xmm7,%%xmm14                   \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "movdqa     %%xmm14,%%xmm15                  \n"
-    "lea        0x10(%0,%3,8),%0                 \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "punpcklwd  %%xmm10,%%xmm8                   \n"
-    "punpcklwd  %%xmm11,%%xmm9                   \n"
-    "movdqa     %%xmm8,%%xmm10                   \n"
-    "movdqa     %%xmm9,%%xmm11                   \n"
-    "palignr    $0x8,%%xmm10,%%xmm10             \n"
-    "palignr    $0x8,%%xmm11,%%xmm11             \n"
-    "punpcklwd  %%xmm14,%%xmm12                  \n"
-    "punpcklwd  %%xmm15,%%xmm13                  \n"
-    "movdqa     %%xmm12,%%xmm14                  \n"
-    "movdqa     %%xmm13,%%xmm15                  \n"
-    "palignr    $0x8,%%xmm14,%%xmm14             \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm12,%%xmm8                   \n"
-    "movq       %%xmm8,(%1)                      \n"
-    "movdqa     %%xmm8,%%xmm12                   \n"
-    "palignr    $0x8,%%xmm12,%%xmm12             \n"
-    "movq       %%xmm12,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm14,%%xmm10                  \n"
-    "movdqa     %%xmm10,%%xmm14                  \n"
-    "movq       %%xmm10,(%1)                     \n"
-    "palignr    $0x8,%%xmm14,%%xmm14             \n"
-    "punpckldq  %%xmm13,%%xmm9                   \n"
-    "movq       %%xmm14,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm9,%%xmm13                   \n"
-    "movq       %%xmm9,(%1)                      \n"
-    "palignr    $0x8,%%xmm13,%%xmm13             \n"
-    "movq       %%xmm13,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm15,%%xmm11                  \n"
-    "movq       %%xmm11,(%1)                     \n"
-    "movdqa     %%xmm11,%%xmm15                  \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    "sub        $0x10,%2                         \n"
-    "movq       %%xmm15,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
-  );
+void TransposeWx8_Fast_SSSE3(const uint8* src,
+                             int src_stride,
+                             uint8* dst,
+                             int dst_stride,
+                             int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                          \n"
+      "movdqu     (%0),%%xmm0                      \n"
+      "movdqu     (%0,%3),%%xmm1                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "punpcklbw  %%xmm1,%%xmm0                    \n"
+      "punpckhbw  %%xmm1,%%xmm8                    \n"
+      "movdqu     (%0),%%xmm2                      \n"
+      "movdqa     %%xmm0,%%xmm1                    \n"
+      "movdqa     %%xmm8,%%xmm9                    \n"
+      "palignr    $0x8,%%xmm1,%%xmm1               \n"
+      "palignr    $0x8,%%xmm9,%%xmm9               \n"
+      "movdqu     (%0,%3),%%xmm3                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm2,%%xmm10                   \n"
+      "punpcklbw  %%xmm3,%%xmm2                    \n"
+      "punpckhbw  %%xmm3,%%xmm10                   \n"
+      "movdqa     %%xmm2,%%xmm3                    \n"
+      "movdqa     %%xmm10,%%xmm11                  \n"
+      "movdqu     (%0),%%xmm4                      \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "palignr    $0x8,%%xmm11,%%xmm11             \n"
+      "movdqu     (%0,%3),%%xmm5                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm4,%%xmm12                   \n"
+      "punpcklbw  %%xmm5,%%xmm4                    \n"
+      "punpckhbw  %%xmm5,%%xmm12                   \n"
+      "movdqa     %%xmm4,%%xmm5                    \n"
+      "movdqa     %%xmm12,%%xmm13                  \n"
+      "movdqu     (%0),%%xmm6                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "palignr    $0x8,%%xmm13,%%xmm13             \n"
+      "movdqu     (%0,%3),%%xmm7                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm6,%%xmm14                   \n"
+      "punpcklbw  %%xmm7,%%xmm6                    \n"
+      "punpckhbw  %%xmm7,%%xmm14                   \n"
+      "neg        %3                               \n"
+      "movdqa     %%xmm6,%%xmm7                    \n"
+      "movdqa     %%xmm14,%%xmm15                  \n"
+      "lea        0x10(%0,%3,8),%0                 \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "palignr    $0x8,%%xmm15,%%xmm15             \n"
+      "neg        %3                               \n"
+      // Second round of bit swap.
+      "punpcklwd  %%xmm2,%%xmm0                    \n"
+      "punpcklwd  %%xmm3,%%xmm1                    \n"
+      "movdqa     %%xmm0,%%xmm2                    \n"
+      "movdqa     %%xmm1,%%xmm3                    \n"
+      "palignr    $0x8,%%xmm2,%%xmm2               \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "punpcklwd  %%xmm6,%%xmm4                    \n"
+      "punpcklwd  %%xmm7,%%xmm5                    \n"
+      "movdqa     %%xmm4,%%xmm6                    \n"
+      "movdqa     %%xmm5,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "punpcklwd  %%xmm10,%%xmm8                   \n"
+      "punpcklwd  %%xmm11,%%xmm9                   \n"
+      "movdqa     %%xmm8,%%xmm10                   \n"
+      "movdqa     %%xmm9,%%xmm11                   \n"
+      "palignr    $0x8,%%xmm10,%%xmm10             \n"
+      "palignr    $0x8,%%xmm11,%%xmm11             \n"
+      "punpcklwd  %%xmm14,%%xmm12                  \n"
+      "punpcklwd  %%xmm15,%%xmm13                  \n"
+      "movdqa     %%xmm12,%%xmm14                  \n"
+      "movdqa     %%xmm13,%%xmm15                  \n"
+      "palignr    $0x8,%%xmm14,%%xmm14             \n"
+      "palignr    $0x8,%%xmm15,%%xmm15             \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "punpckldq  %%xmm4,%%xmm0                    \n"
+      "movq       %%xmm0,(%1)                      \n"
+      "movdqa     %%xmm0,%%xmm4                    \n"
+      "palignr    $0x8,%%xmm4,%%xmm4               \n"
+      "movq       %%xmm4,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm6,%%xmm2                    \n"
+      "movdqa     %%xmm2,%%xmm6                    \n"
+      "movq       %%xmm2,(%1)                      \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "punpckldq  %%xmm5,%%xmm1                    \n"
+      "movq       %%xmm6,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "movdqa     %%xmm1,%%xmm5                    \n"
+      "movq       %%xmm1,(%1)                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "movq       %%xmm5,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm7,%%xmm3                    \n"
+      "movq       %%xmm3,(%1)                      \n"
+      "movdqa     %%xmm3,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "movq       %%xmm7,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm12,%%xmm8                   \n"
+      "movq       %%xmm8,(%1)                      \n"
+      "movdqa     %%xmm8,%%xmm12                   \n"
+      "palignr    $0x8,%%xmm12,%%xmm12             \n"
+      "movq       %%xmm12,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm14,%%xmm10                  \n"
+      "movdqa     %%xmm10,%%xmm14                  \n"
+      "movq       %%xmm10,(%1)                     \n"
+      "palignr    $0x8,%%xmm14,%%xmm14             \n"
+      "punpckldq  %%xmm13,%%xmm9                   \n"
+      "movq       %%xmm14,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "movdqa     %%xmm9,%%xmm13                   \n"
+      "movq       %%xmm9,(%1)                      \n"
+      "palignr    $0x8,%%xmm13,%%xmm13             \n"
+      "movq       %%xmm13,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm15,%%xmm11                  \n"
+      "movq       %%xmm11,(%1)                     \n"
+      "movdqa     %%xmm11,%%xmm15                  \n"
+      "palignr    $0x8,%%xmm15,%%xmm15             \n"
+      "sub        $0x10,%2                         \n"
+      "movq       %%xmm15,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "jg         1b                               \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+        "xmm15");
 }
 #endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
 
 // Transpose UV 8x8.  64 bit.
 #if defined(HAS_TRANSPOSEUVWX8_SSE2)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     (%0),%%xmm0                      \n"
-    "movdqu     (%0,%4),%%xmm1                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "punpckhbw  %%xmm1,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm1                    \n"
-    "movdqu     (%0),%%xmm2                      \n"
-    "movdqu     (%0,%4),%%xmm3                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm2,%%xmm8                    \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "punpckhbw  %%xmm3,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm3                    \n"
-    "movdqu     (%0),%%xmm4                      \n"
-    "movdqu     (%0,%4),%%xmm5                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm4,%%xmm8                    \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "punpckhbw  %%xmm5,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm5                    \n"
-    "movdqu     (%0),%%xmm6                      \n"
-    "movdqu     (%0,%4),%%xmm7                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm6,%%xmm8                    \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %4                               \n"
-    "lea        0x10(%0,%4,8),%0                 \n"
-    "punpckhbw  %%xmm7,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm7                    \n"
-    "neg        %4                               \n"
-     // Second round of bit swap.
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "movdqa     %%xmm1,%%xmm9                    \n"
-    "punpckhwd  %%xmm2,%%xmm8                    \n"
-    "punpckhwd  %%xmm3,%%xmm9                    \n"
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm8,%%xmm2                    \n"
-    "movdqa     %%xmm9,%%xmm3                    \n"
-    "movdqa     %%xmm4,%%xmm8                    \n"
-    "movdqa     %%xmm5,%%xmm9                    \n"
-    "punpckhwd  %%xmm6,%%xmm8                    \n"
-    "punpckhwd  %%xmm7,%%xmm9                    \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm8,%%xmm6                    \n"
-    "movdqa     %%xmm9,%%xmm7                    \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
-    "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
-    "punpckhdq  %%xmm4,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm2,%%xmm8                    \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movlpd     %%xmm2,(%1)                      \n"
-    "movhpd     %%xmm2,(%2)                      \n"
-    "punpckhdq  %%xmm6,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm1,%%xmm8                    \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movlpd     %%xmm1,(%1)                      \n"
-    "movhpd     %%xmm1,(%2)                      \n"
-    "punpckhdq  %%xmm5,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm3,%%xmm8                    \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movlpd     %%xmm3,(%1)                      \n"
-    "movhpd     %%xmm3,(%2)                      \n"
-    "punpckhdq  %%xmm7,%%xmm8                    \n"
-    "sub        $0x8,%3                          \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst_a),  // %1
-      "+r"(dst_b),  // %2
-      "+r"(width)   // %3
-    : "r"((intptr_t)(src_stride)),    // %4
-      "r"((intptr_t)(dst_stride_a)),  // %5
-      "r"((intptr_t)(dst_stride_b))   // %6
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9"
-  );
+void TransposeUVWx8_SSE2(const uint8* src,
+                         int src_stride,
+                         uint8* dst_a,
+                         int dst_stride_a,
+                         uint8* dst_b,
+                         int dst_stride_b,
+                         int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                          \n"
+      "movdqu     (%0),%%xmm0                      \n"
+      "movdqu     (%0,%4),%%xmm1                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "punpcklbw  %%xmm1,%%xmm0                    \n"
+      "punpckhbw  %%xmm1,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm1                    \n"
+      "movdqu     (%0),%%xmm2                      \n"
+      "movdqu     (%0,%4),%%xmm3                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm2,%%xmm8                    \n"
+      "punpcklbw  %%xmm3,%%xmm2                    \n"
+      "punpckhbw  %%xmm3,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm3                    \n"
+      "movdqu     (%0),%%xmm4                      \n"
+      "movdqu     (%0,%4),%%xmm5                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm4,%%xmm8                    \n"
+      "punpcklbw  %%xmm5,%%xmm4                    \n"
+      "punpckhbw  %%xmm5,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm5                    \n"
+      "movdqu     (%0),%%xmm6                      \n"
+      "movdqu     (%0,%4),%%xmm7                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm6,%%xmm8                    \n"
+      "punpcklbw  %%xmm7,%%xmm6                    \n"
+      "neg        %4                               \n"
+      "lea        0x10(%0,%4,8),%0                 \n"
+      "punpckhbw  %%xmm7,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm7                    \n"
+      "neg        %4                               \n"
+      // Second round of bit swap.
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "movdqa     %%xmm1,%%xmm9                    \n"
+      "punpckhwd  %%xmm2,%%xmm8                    \n"
+      "punpckhwd  %%xmm3,%%xmm9                    \n"
+      "punpcklwd  %%xmm2,%%xmm0                    \n"
+      "punpcklwd  %%xmm3,%%xmm1                    \n"
+      "movdqa     %%xmm8,%%xmm2                    \n"
+      "movdqa     %%xmm9,%%xmm3                    \n"
+      "movdqa     %%xmm4,%%xmm8                    \n"
+      "movdqa     %%xmm5,%%xmm9                    \n"
+      "punpckhwd  %%xmm6,%%xmm8                    \n"
+      "punpckhwd  %%xmm7,%%xmm9                    \n"
+      "punpcklwd  %%xmm6,%%xmm4                    \n"
+      "punpcklwd  %%xmm7,%%xmm5                    \n"
+      "movdqa     %%xmm8,%%xmm6                    \n"
+      "movdqa     %%xmm9,%%xmm7                    \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "punpckldq  %%xmm4,%%xmm0                    \n"
+      "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
+      "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
+      "punpckhdq  %%xmm4,%%xmm8                    \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "movdqa     %%xmm2,%%xmm8                    \n"
+      "punpckldq  %%xmm6,%%xmm2                    \n"
+      "movlpd     %%xmm2,(%1)                      \n"
+      "movhpd     %%xmm2,(%2)                      \n"
+      "punpckhdq  %%xmm6,%%xmm8                    \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "movdqa     %%xmm1,%%xmm8                    \n"
+      "punpckldq  %%xmm5,%%xmm1                    \n"
+      "movlpd     %%xmm1,(%1)                      \n"
+      "movhpd     %%xmm1,(%2)                      \n"
+      "punpckhdq  %%xmm5,%%xmm8                    \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "movdqa     %%xmm3,%%xmm8                    \n"
+      "punpckldq  %%xmm7,%%xmm3                    \n"
+      "movlpd     %%xmm3,(%1)                      \n"
+      "movhpd     %%xmm3,(%2)                      \n"
+      "punpckhdq  %%xmm7,%%xmm8                    \n"
+      "sub        $0x8,%3                          \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "jg         1b                               \n"
+      : "+r"(src),                      // %0
+        "+r"(dst_a),                    // %1
+        "+r"(dst_b),                    // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride)),    // %4
+        "r"((intptr_t)(dst_stride_a)),  // %5
+        "r"((intptr_t)(dst_stride_b))   // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7", "xmm8", "xmm9");
 }
 #endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
 #endif  // defined(__x86_64__) || defined(__i386__)
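
Editorial note: the reformatted TransposeUVWx8_SSE2 above is unchanged in behavior; the three "rounds of bit swap" are the usual interleave-based transpose, and the final movlpd/movhpd stores split each transposed column into its U and V halves. As a point of reference only, here is a scalar sketch of the data movement this kernel performs, assuming the interleaved-UV layout used by TransposeUVWx8_C; the helper name below is hypothetical and not part of the patch.

// Scalar reference for the UV transpose: column i of an 8-row block of
// interleaved UV pixels becomes output row i, with U bytes written to
// dst_a and V bytes to dst_b.
static void TransposeUVWx8_Reference(const uint8* src, int src_stride,
                                     uint8* dst_a, int dst_stride_a,
                                     uint8* dst_b, int dst_stride_b,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];  // U
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V
    }
  }
}
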
diff --git a/files/source/rotate_msa.cc b/files/source/rotate_msa.cc
new file mode 100644
index 0000000..8907765
--- /dev/null
+++ b/files/source/rotate_msa.cc
@@ -0,0 +1,250 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0);     \
+    out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0);     \
+    out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2);     \
+    out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2);     \
+  }
+
+#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0);     \
+    out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0);     \
+    out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2);     \
+    out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2);     \
+  }
+
+#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0);     \
+    out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0);     \
+    out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2);     \
+    out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2);     \
+  }
+
+#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0);     \
+    out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0);     \
+    out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2);     \
+    out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2);     \
+  }
+
+void TransposeWx16_C(const uint8* src,
+                     int src_stride,
+                     uint8* dst,
+                     int dst_stride,
+                     int width) {
+  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+  TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+                 width);
+}
+
+void TransposeUVWx16_C(const uint8* src,
+                       int src_stride,
+                       uint8* dst_a,
+                       int dst_stride_a,
+                       uint8* dst_b,
+                       int dst_stride_b,
+                       int width) {
+  TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                   width);
+  TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
+                   dst_stride_a, (dst_b + 8), dst_stride_b, width);
+}
+
+void TransposeWx16_MSA(const uint8* src,
+                       int src_stride,
+                       uint8* dst,
+                       int dst_stride,
+                       int width) {
+  int x;
+  const uint8* s;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+  for (x = 0; x < width; x += 16) {
+    s = src;
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    src += 16;
+    dst += dst_stride * 4;
+  }
+}
+
+void TransposeUVWx16_MSA(const uint8* src,
+                         int src_stride,
+                         uint8* dst_a,
+                         int dst_stride_a,
+                         uint8* dst_b,
+                         int dst_stride_b,
+                         int width) {
+  int x;
+  const uint8* s;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+  for (x = 0; x < width; x += 8) {
+    s = src;
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    src += 16;
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
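
Editorial note: the ILVRL_B/H/W/D helpers in the new MSA file are interleave pairs (ilvr/ilvl) at byte, halfword, word, and doubleword granularity; four rounds of them assemble a 16x16 byte transpose, the vector counterpart of the scalar loop sketched below. The sketch mirrors what TransposeWx16_C computes via its two TransposeWx8_C calls; it is a reference illustration, not part of the patch.

// Scalar reference for TransposeWx16: column i of a 16-row block becomes
// the 16-byte output row i.
static void TransposeWx16_Reference(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride, int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 16; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}
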
diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc
index 1c22b47..41ec34e 100644
--- a/files/source/rotate_neon.cc
+++ b/files/source/rotate_neon.cc
@@ -21,11 +21,13 @@
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     !defined(__aarch64__)
 
-static uvec8 kVTbl4x4Transpose =
-  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
+static uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
+                                  2, 6, 10, 14, 3, 7, 11, 15};
 
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride,
+void TransposeWx8_NEON(const uint8* src,
+                       int src_stride,
+                       uint8* dst,
+                       int dst_stride,
                        int width) {
   const uint8* src_temp;
   asm volatile (
@@ -240,12 +242,15 @@
   );
 }
 
-static uvec8 kVTbl4x4TransposeDi =
-  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
+static uvec8 kVTbl4x4TransposeDi = {0, 8,  1, 9,  2, 10, 3, 11,
+                                    4, 12, 5, 13, 6, 14, 7, 15};
 
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8* src,
+                         int src_stride,
+                         uint8* dst_a,
+                         int dst_stride_a,
+                         uint8* dst_b,
+                         int dst_stride_b,
                          int width) {
   const uint8* src_temp;
   asm volatile (
diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc
index 1ab448f..3cf1793 100644
--- a/files/source/rotate_neon64.cc
+++ b/files/source/rotate_neon64.cc
@@ -21,13 +21,16 @@
 // This module is for GCC Neon armv8 64 bit.
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
-static uvec8 kVTbl4x4Transpose =
-  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+static uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
+                                  2, 6, 10, 14, 3, 7, 11, 15};
 
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) {
+void TransposeWx8_NEON(const uint8* src,
+                       int src_stride,
+                       uint8* dst,
+                       int dst_stride,
+                       int width) {
   const uint8* src_temp;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  int64 width64 = (int64)width;  // Work around clang 3.4 warning.
   asm volatile (
     // loops are on blocks of 8. loop will stop when
     // counter gets to or below 0. starting the counter
@@ -247,16 +250,19 @@
   );
 }
 
-static uint8 kVTbl4x4TransposeDi[32] =
-  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
-    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+static uint8 kVTbl4x4TransposeDi[32] = {
+    0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+    1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
 
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8* src,
+                         int src_stride,
+                         uint8* dst_a,
+                         int dst_stride_a,
+                         uint8* dst_b,
+                         int dst_stride_b,
                          int width) {
   const uint8* src_temp;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  int64 width64 = (int64)width;  // Work around clang 3.4 warning.
   asm volatile (
     // loops are on blocks of 8. loop will stop when
     // counter gets to or below 0. starting the counter
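
Editorial note: the reflowed kVTbl4x4Transpose tables in the two NEON files above are byte-permute indices for the table-lookup instruction; treating a 16-byte register as a row-major 4x4 byte matrix, the lookup yields its transpose. In scalar terms (illustration only, assuming out[i] = in[index[i]] lookup semantics):

// out[i] = in[kIdx[i]]: rows of the output are columns of the input,
// i.e. a 4x4 byte transpose within one 16-byte vector.
static void Tbl4x4TransposeReference(const uint8* in, uint8* out) {
  static const uint8 kIdx[16] = {0, 4, 8,  12, 1, 5, 9,  13,
                                 2, 6, 10, 14, 3, 7, 11, 15};
  for (int i = 0; i < 16; ++i) {
    out[i] = in[kIdx[i]];
  }
}
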
diff --git a/files/source/rotate_win.cc b/files/source/rotate_win.cc
index 1300fc0..201643e 100644
--- a/files/source/rotate_win.cc
+++ b/files/source/rotate_win.cc
@@ -19,15 +19,17 @@
 // This module is for 32 bit Visual C x86 and clangcl
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
 
-__declspec(naked)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
+__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
+                                          int src_stride,
+                                          uint8* dst,
+                                          int dst_stride,
+                                          int width) {
   __asm {
     push      edi
     push      esi
     push      ebp
-    mov       eax, [esp + 12 + 4]   // src
-    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       eax, [esp + 12 + 4]  // src
+    mov       edi, [esp + 12 + 8]  // src_stride
     mov       edx, [esp + 12 + 12]  // dst
     mov       esi, [esp + 12 + 16]  // dst_stride
     mov       ecx, [esp + 12 + 20]  // width
@@ -110,18 +112,20 @@
   }
 }
 
-__declspec(naked)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int w) {
+__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
+                                           int src_stride,
+                                           uint8* dst_a,
+                                           int dst_stride_a,
+                                           uint8* dst_b,
+                                           int dst_stride_b,
+                                           int w) {
   __asm {
     push      ebx
     push      esi
     push      edi
     push      ebp
-    mov       eax, [esp + 16 + 4]   // src
-    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       eax, [esp + 16 + 4]  // src
+    mov       edi, [esp + 16 + 8]  // src_stride
     mov       edx, [esp + 16 + 12]  // dst_a
     mov       esi, [esp + 16 + 16]  // dst_stride_a
     mov       ebx, [esp + 16 + 20]  // dst_b
@@ -134,8 +138,8 @@
 
     align      4
  convertloop:
-    // Read in the data from the source pointer.
-    // First round of bit swap.
+    // Read in the data from the source pointer.
+    // First round of bit swap.
     movdqu    xmm0, [eax]
     movdqu    xmm1, [eax + edi]
     lea       eax, [eax + 2 * edi]
@@ -162,7 +166,7 @@
     lea       eax, [eax + 2 * edi]
     movdqu    [esp], xmm5  // backup xmm5
     neg       edi
-    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    movdqa    xmm5, xmm6             // use xmm5 as temp register.
     punpcklbw xmm6, xmm7
     punpckhbw xmm5, xmm7
     movdqa    xmm7, xmm5
@@ -183,7 +187,7 @@
     movdqa    xmm6, xmm5
     movdqu    xmm5, [esp]  // restore xmm5
     movdqu    [esp], xmm6  // backup xmm6
-    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    movdqa    xmm6, xmm5             // use xmm6 as temp register.
     punpcklwd xmm5, xmm7
     punpckhwd xmm6, xmm7
     movdqa    xmm7, xmm6
@@ -200,7 +204,7 @@
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm4
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
     punpckldq xmm2, xmm6
     movlpd    qword ptr [edx], xmm2
     movhpd    qword ptr [ebx], xmm2
@@ -209,7 +213,7 @@
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
     punpckldq xmm1, xmm5
     movlpd    qword ptr [edx], xmm1
     movhpd    qword ptr [ebx], xmm1
@@ -218,7 +222,7 @@
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
     punpckldq xmm3, xmm7
     movlpd    qword ptr [edx], xmm3
     movhpd    qword ptr [ebx], xmm3
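
Editorial note: the row_any.cc hunks that follow re-wrap the ANY* helper macros for the new formatting and add MSA/DSPR2 instantiations; the underlying pattern is unchanged. For orientation, this is roughly what one ANY11 instantiation expands to for a hypothetical FooRow kernel with UVSHIFT=0, SBPP=1, BPP=1, MASK=15 (sketch only; memset/memcpy and SIMD_ALIGNED come from row_any.cc's existing includes).

// The aligned bulk of the row goes straight to the SIMD kernel; the ragged
// tail is copied into an aligned temp buffer, processed as one full
// 16-pixel chunk, and only the valid r bytes are copied back.
void FooRow_Any_SIMD(const uint8* src_ptr, uint8* dst_ptr, int width) {
  SIMD_ALIGNED(uint8 temp[128 * 2]);
  memset(temp, 0, 128); /* for msan */
  int r = width & 15;   /* leftover pixels */
  int n = width & ~15;  /* multiple-of-16 pixel count */
  if (n > 0) {
    FooRow_SIMD(src_ptr, dst_ptr, n);
  }
  memcpy(temp, src_ptr + n, r);
  FooRow_SIMD(temp, temp + 128, 16);
  memcpy(dst_ptr + n, temp + 128, r);
}
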
diff --git a/files/source/row_any.cc b/files/source/row_any.cc
index 494164f..74a6621 100644
--- a/files/source/row_any.cc
+++ b/files/source/row_any.cc
@@ -23,26 +23,26 @@
 #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
 
 // Any 4 planes to 1 with yuvconstants
-#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 const uint8* a_buf, uint8* dst_ptr,                           \
-                 const struct YuvConstants* yuvconstants,  int width) {        \
-      SIMD_ALIGNED(uint8 temp[64 * 5]);                                        \
-      memset(temp, 0, 64 * 4);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      memcpy(temp + 192, a_buf + n, r);                                        \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
-               yuvconstants, MASK + 1);                                        \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)            \
+  void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+               const uint8* a_buf, uint8* dst_ptr,                         \
+               const struct YuvConstants* yuvconstants, int width) {       \
+    SIMD_ALIGNED(uint8 temp[64 * 5]);                                      \
+    memset(temp, 0, 64 * 4); /* for msan */                                \
+    int r = width & MASK;                                                  \
+    int n = width & ~MASK;                                                 \
+    if (n > 0) {                                                           \
+      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);      \
+    }                                                                      \
+    memcpy(temp, y_buf + n, r);                                            \
+    memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));             \
+    memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));            \
+    memcpy(temp + 192, a_buf + n, r);                                      \
+    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,          \
+             yuvconstants, MASK + 1);                                      \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                    \
+           SS(r, DUVSHIFT) * BPP);                                         \
+  }
 
 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
 ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
@@ -53,26 +53,29 @@
 #ifdef HAS_I422ALPHATOARGBROW_NEON
 ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
 #endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
 #undef ANY41C
 
 // Any 3 planes to 1.
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
-      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)             \
+  void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+               uint8* dst_ptr, int width) {                                \
+    SIMD_ALIGNED(uint8 temp[64 * 4]);                                      \
+    memset(temp, 0, 64 * 3); /* for YUY2 and msan */                       \
+    int r = width & MASK;                                                  \
+    int n = width & ~MASK;                                                 \
+    if (n > 0) {                                                           \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                           \
+    }                                                                      \
+    memcpy(temp, y_buf + n, r);                                            \
+    memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));             \
+    memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));            \
+    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);           \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                    \
+           SS(r, DUVSHIFT) * BPP);                                         \
+  }
 #ifdef HAS_I422TOYUY2ROW_SSE2
 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
 ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@@ -80,9 +83,15 @@
 #ifdef HAS_I422TOYUY2ROW_NEON
 ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_I422TOUYVYROW_NEON
 ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_BLENDPLANEROW_AVX2
 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
 #endif
@@ -95,35 +104,31 @@
 // on arm that subsamples 444 to 422 internally.
 // Any 3 planes to 1 with yuvconstants
 #define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
-                 int width) {                                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
-      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);               \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      if (width & 1) {                                                         \
-        temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1];             \
-        temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];           \
-      }                                                                        \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192,                        \
-               yuvconstants, MASK + 1);                                        \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+  void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,     \
+               uint8* dst_ptr, const struct YuvConstants* yuvconstants,        \
+               int width) {                                                    \
+    SIMD_ALIGNED(uint8 temp[64 * 4]);                                          \
+    memset(temp, 0, 64 * 3); /* for YUY2 and msan */                           \
+    int r = width & MASK;                                                      \
+    int n = width & ~MASK;                                                     \
+    if (n > 0) {                                                               \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);                 \
+    }                                                                          \
+    memcpy(temp, y_buf + n, r);                                                \
+    memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));                 \
+    memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));                \
+    if (width & 1) {                                                           \
+      temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1];               \
+      temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];             \
+    }                                                                          \
+    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                        \
+           SS(r, DUVSHIFT) * BPP);                                             \
+  }
 
 #ifdef HAS_I422TOARGBROW_SSSE3
 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
 #endif
-#ifdef HAS_I411TOARGBROW_SSSE3
-ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
-#endif
 #ifdef HAS_I444TOARGBROW_SSSE3
 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
 ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
@@ -144,9 +149,6 @@
 #ifdef HAS_I444TOARGBROW_AVX2
 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
 #endif
-#ifdef HAS_I411TOARGBROW_AVX2
-ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
-#endif
 #ifdef HAS_I422TOARGB4444ROW_AVX2
 ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
 #endif
@@ -159,32 +161,46 @@
 #ifdef HAS_I422TOARGBROW_NEON
 ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
 ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
-ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
 ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
 ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
 ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
 ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
 ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
 #endif
+#ifdef HAS_I422TOARGBROW_DSPR2
+ANY31C(I444ToARGBRow_Any_DSPR2, I444ToARGBRow_DSPR2, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_DSPR2, I422ToARGBRow_DSPR2, 1, 0, 4, 7)
+ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
+#endif
 #undef ANY31C
 
 // Any 2 planes to 1.
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
-    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                   \
-      }                                                                        \
-      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
-      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
-             SS(r, UVSHIFT) * SBPP2);                                          \
-      ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                         \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)       \
+  void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
+               int width) {                                             \
+    SIMD_ALIGNED(uint8 temp[64 * 3]);                                   \
+    memset(temp, 0, 64 * 2); /* for msan */                             \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                              \
+    }                                                                   \
+    memcpy(temp, y_buf + n * SBPP, r * SBPP);                           \
+    memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                  \
+           SS(r, UVSHIFT) * SBPP2);                                     \
+    ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                    \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                     \
+  }
 
 // Merge functions.
 #ifdef HAS_MERGEUVROW_SSE2
@@ -196,6 +212,9 @@
 #ifdef HAS_MERGEUVROW_NEON
 ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
 #endif
+#ifdef HAS_MERGEUVROW_MSA
+ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
+#endif
 
 // Math functions.
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
@@ -225,44 +244,61 @@
 #ifdef HAS_ARGBSUBTRACTROW_NEON
 ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
 #ifdef HAS_SOBELROW_SSE2
 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
 #endif
 #ifdef HAS_SOBELROW_NEON
 ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
 #endif
 #ifdef HAS_SOBELTOPLANEROW_NEON
 ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
 #endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
 #ifdef HAS_SOBELXYROW_SSE2
 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
 #endif
 #ifdef HAS_SOBELXYROW_NEON
 ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
 #undef ANY21
 
 // Any 2 planes to 1 with yuvconstants
-#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \
-    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
-                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
-                 int width) {                                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                     \
-      }                                                                        \
-      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
-      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
-             SS(r, UVSHIFT) * SBPP2);                                          \
-      ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1);           \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)      \
+  void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
+               const struct YuvConstants* yuvconstants, int width) {    \
+    SIMD_ALIGNED(uint8 temp[64 * 3]);                                   \
+    memset(temp, 0, 64 * 2); /* for msan */                             \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                \
+    }                                                                   \
+    memcpy(temp, y_buf + n * SBPP, r * SBPP);                           \
+    memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                  \
+           SS(r, UVSHIFT) * SBPP2);                                     \
+    ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1);      \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                     \
+  }
 
 // Biplanar to RGB.
 #ifdef HAS_NV12TOARGBROW_SSSE3
@@ -274,6 +310,12 @@
 #ifdef HAS_NV12TOARGBROW_NEON
 ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
 #endif
+#ifdef HAS_NV12TOARGBROW_DSPR2
+ANY21C(NV12ToARGBRow_Any_DSPR2, NV12ToARGBRow_DSPR2, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
 #ifdef HAS_NV21TOARGBROW_SSSE3
 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
 #endif
@@ -283,6 +325,9 @@
 #ifdef HAS_NV21TOARGBROW_NEON
 ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
 #endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
 #ifdef HAS_NV12TORGB565ROW_SSSE3
 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
 #endif
@@ -292,22 +337,25 @@
 #ifdef HAS_NV12TORGB565ROW_NEON
 ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
 #endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
 #undef ANY21C
 
 // Any 1 to 1.
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                \
+  void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {         \
+    SIMD_ALIGNED(uint8 temp[128 * 2]);                                    \
+    memset(temp, 0, 128); /* for YUY2 and msan */                         \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    ANY_SIMD(temp, temp + 128, MASK + 1);                                 \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
+  }
 
 #ifdef HAS_COPYROW_AVX
 ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
@@ -372,9 +420,21 @@
 ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
 ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
 #endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
+ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
 #if defined(HAS_RAWTORGB24ROW_NEON)
 ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
 #endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
+#endif
 #ifdef HAS_ARGBTOYROW_AVX2
 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
 #endif
@@ -403,30 +463,57 @@
 #ifdef HAS_ARGBTOYROW_NEON
 ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBTOYROW_MSA
+ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_ARGBTOYJROW_NEON
 ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBTOYJROW_MSA
+ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_BGRATOYROW_NEON
 ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_BGRATOYROW_MSA
+ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_ABGRTOYROW_NEON
 ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ABGRTOYROW_MSA
+ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
+#endif
 #ifdef HAS_RGBATOYROW_NEON
 ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_RGBATOYROW_MSA
+ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_RGB24TOYROW_NEON
 ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
 #endif
+#ifdef HAS_RGB24TOYROW_MSA
+ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
+#endif
 #ifdef HAS_RAWTOYROW_NEON
 ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
 #endif
+#ifdef HAS_RAWTOYROW_MSA
+ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
+#endif
 #ifdef HAS_RGB565TOYROW_NEON
 ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#ifdef HAS_RGB565TOYROW_MSA
+ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
+#endif
 #ifdef HAS_ARGB1555TOYROW_NEON
 ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#ifdef HAS_ARGB1555TOYROW_MSA
+ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
+#endif
 #ifdef HAS_ARGB4444TOYROW_NEON
 ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
 #endif
@@ -434,23 +521,71 @@
 ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
 #endif
 #ifdef HAS_UYVYTOYROW_NEON
-ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
 #endif
 #ifdef HAS_RGB24TOARGBROW_NEON
 ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
 #endif
+#ifdef HAS_RGB24TOARGBROW_MSA
+ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
+#endif
 #ifdef HAS_RAWTOARGBROW_NEON
 ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
 #endif
+#ifdef HAS_RAWTOARGBROW_MSA
+ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
+#endif
 #ifdef HAS_RGB565TOARGBROW_NEON
 ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_RGB565TOARGBROW_MSA
+ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
 #ifdef HAS_ARGB1555TOARGBROW_NEON
 ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_ARGB1555TOARGBROW_MSA
+ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
 #ifdef HAS_ARGB4444TOARGBROW_NEON
 ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_RGB24TOARGBROW_DSPR2
+ANY11(RGB24ToARGBRow_Any_DSPR2, RGB24ToARGBRow_DSPR2, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_DSPR2
+ANY11(RAWToARGBRow_Any_DSPR2, RAWToARGBRow_DSPR2, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_DSPR2
+ANY11(RGB565ToARGBRow_Any_DSPR2, RGB565ToARGBRow_DSPR2, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_DSPR2
+ANY11(ARGB1555ToARGBRow_Any_DSPR2, ARGB1555ToARGBRow_DSPR2, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_DSPR2
+ANY11(ARGB4444ToARGBRow_Any_DSPR2, ARGB4444ToARGBRow_DSPR2, 0, 2, 4, 7)
+#endif
+#ifdef HAS_BGRATOYROW_DSPR2
+ANY11(BGRAToYRow_Any_DSPR2, BGRAToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_DSPR2
+ANY11(ARGBToYRow_Any_DSPR2, ARGBToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_DSPR2
+ANY11(ABGRToYRow_Any_DSPR2, ABGRToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_DSPR2
+ANY11(RGBAToYRow_Any_DSPR2, RGBAToYRow_DSPR2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_MSA
+ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
 #endif
@@ -466,29 +601,35 @@
 #ifdef HAS_ARGBATTENUATEROW_NEON
 ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBATTENUATEROW_MSA
+ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
+#endif
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 32)
+#endif
 #ifdef HAS_ARGBEXTRACTALPHAROW_NEON
 ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
 #endif
 #undef ANY11
 
 // Any 1 to 1 blended.  Destination is read, modify, write.
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128 * 2);  /* for YUY2 and msan */                       \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      memcpy(temp + 128, dst_ptr + n * BPP, r * BPP);                          \
-      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)               \
+  void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {         \
+    SIMD_ALIGNED(uint8 temp[128 * 2]);                                    \
+    memset(temp, 0, 128 * 2); /* for YUY2 and msan */                     \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    memcpy(temp + 128, dst_ptr + n * BPP, r * BPP);                       \
+    ANY_SIMD(temp, temp + 128, MASK + 1);                                 \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
+  }
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
@@ -505,32 +646,51 @@
 #undef ANY11B
 
 // Any 1 to 1 with parameter.
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
-                 T shuffler, int width) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                           \
-      memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                           \
-    }
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                         \
+  void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, T shuffler, int width) { \
+    SIMD_ALIGNED(uint8 temp[64 * 2]);                                         \
+    memset(temp, 0, 64); /* for msan */                                       \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                                \
+    }                                                                         \
+    memcpy(temp, src_ptr + n * SBPP, r * SBPP);                               \
+    ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                            \
+    memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                            \
+  }
 
 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
-       const uint32, 4, 2, 3)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+       ARGBToRGB565DitherRow_SSE2,
+       const uint32,
+       4,
+       2,
+       3)
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
-       const uint32, 4, 2, 7)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+       ARGBToRGB565DitherRow_AVX2,
+       const uint32,
+       4,
+       2,
+       7)
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
-       const uint32, 4, 2, 7)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+       ARGBToRGB565DitherRow_NEON,
+       const uint32,
+       4,
+       2,
+       7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+       ARGBToRGB565DitherRow_MSA,
+       const uint32,
+       4,
+       2,
+       7)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_SSE2
 ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
@@ -544,23 +704,58 @@
 #ifdef HAS_ARGBSHUFFLEROW_NEON
 ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
 #endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
+#endif
 #undef ANY11P
 
+// Any 1 to 1 with parameter and shorts.  BPP is measured in shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)            \
+  void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T shuffler, \
+               int width) {                                        \
+    SIMD_ALIGNED(uint16 temp[32 * 2]);                             \
+    memset(temp, 0, 64); /* for msan */                            \
+    int r = width & MASK;                                          \
+    int n = width & ~MASK;                                         \
+    if (n > 0) {                                                   \
+      ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                     \
+    }                                                              \
+    memcpy(temp, src_ptr + n * SBPP, r * SBPP);                    \
+    ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                 \
+    memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                 \
+  }
+
+#ifdef HAS_HALFFLOATROW_SSE2
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
+ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 1, 1, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7)
+ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 1, 1, 7)
+#endif
+#undef ANY11P16
+
 // Any 1 to 1 with yuvconstants
-#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
-                 const struct YuvConstants* yuvconstants, int width) {         \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                           \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                      \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)               \
+  void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                      \
+               const struct YuvConstants* yuvconstants, int width) {      \
+    SIMD_ALIGNED(uint8 temp[128 * 2]);                                    \
+    memset(temp, 0, 128); /* for YUY2 and msan */                         \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                        \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                   \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
+  }
 #if defined(HAS_YUY2TOARGBROW_SSSE3)
 ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
 ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
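
The ANY11P / ANY11P16 / ANY11C wrappers reformatted above all share one remainder-handling idea: run the SIMD kernel on the largest multiple of (MASK + 1) pixels, then copy the leftover pixels into a zero-filled staging buffer, run the kernel once more on a full block, and copy only the valid results back. A minimal standalone sketch of that pattern, using a hypothetical InvertBlock16 kernel rather than any real libyuv row function:

/* Standalone sketch of the "any width" pattern (not libyuv code).
 * InvertBlock16 stands in for a SIMD kernel that may only be called
 * on a multiple of 16 pixels. */
#include <string.h>

typedef unsigned char uint8;

static void InvertBlock16(const uint8* src, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) { /* imagine SIMD doing 16 at a time */
    dst[i] = 255 - src[i];
  }
}

void InvertRow_Any(const uint8* src, uint8* dst, int width) {
  uint8 temp[16 * 2];
  const int kMask = 15;
  int r = width & kMask;  /* leftover pixels */
  int n = width & ~kMask; /* largest multiple of 16 */
  memset(temp, 0, 16);    /* zero the staging area, as the macros do */
  if (n > 0) {
    InvertBlock16(src, dst, n);       /* bulk of the row via the kernel */
  }
  memcpy(temp, src + n, r);           /* stage the remainder */
  InvertBlock16(temp, temp + 16, 16); /* full block over valid + zeroed data */
  memcpy(dst + n, temp + 16, r);      /* write back only the r real results */
}

The memsets in the real macros serve the same purpose as the one here: the unused lanes of the final block read zeroed rather than uninitialized bytes, which is what the /* for msan */ comments refer to.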
@@ -573,25 +768,28 @@
 ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
 ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
 #endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
+#endif
 #undef ANY11C
 
 // Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
 #define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 ptrdiff_t src_stride_ptr, int width,                          \
-                 int source_y_fraction) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
-      ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+  void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, \
+               int width, int source_y_fraction) {                             \
+    SIMD_ALIGNED(uint8 temp[64 * 3]);                                          \
+    memset(temp, 0, 64 * 2); /* for msan */                                    \
+    int r = width & MASK;                                                      \
+    int n = width & ~MASK;                                                     \
+    if (n > 0) {                                                               \
+      ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);        \
+    }                                                                          \
+    memcpy(temp, src_ptr + n * SBPP, r * SBPP);                                \
+    memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);          \
+    ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);               \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                            \
+  }
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
@@ -605,22 +803,25 @@
 #ifdef HAS_INTERPOLATEROW_DSPR2
 ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
 #endif
+#ifdef HAS_INTERPOLATEROW_MSA
+ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
+#endif
 #undef ANY11T
 
 // Any 1 to 1 mirror.
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                   \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr, r * BPP);                                          \
-      ANY_SIMD(temp, temp + 64, MASK + 1);                                     \
-      memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP);    \
-    }
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                              \
+  void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {         \
+    SIMD_ALIGNED(uint8 temp[64 * 2]);                                     \
+    memset(temp, 0, 64); /* for msan */                                   \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                            \
+    }                                                                     \
+    memcpy(temp, src_ptr, r * BPP);                                       \
+    ANY_SIMD(temp, temp + 64, MASK + 1);                                  \
+    memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
+  }
 
 #ifdef HAS_MIRRORROW_AVX2
 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
@@ -631,6 +832,9 @@
 #ifdef HAS_MIRRORROW_NEON
 ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
 #endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
 #ifdef HAS_ARGBMIRRORROW_AVX2
 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 #endif
@@ -640,20 +844,23 @@
 #ifdef HAS_ARGBMIRRORROW_NEON
 ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
 #endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
 #undef ANY11M
 
 // Any 1 plane. (memset)
-#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)                                  \
-    void NAMEANY(uint8* dst_ptr, T v32, int width) {                           \
-      SIMD_ALIGNED(uint8 temp[64]);                                            \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, v32, n);                                             \
-      }                                                                        \
-      ANY_SIMD(temp, v32, MASK + 1);                                           \
-      memcpy(dst_ptr + n * BPP, temp, r * BPP);                                \
-    }
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)      \
+  void NAMEANY(uint8* dst_ptr, T v32, int width) { \
+    SIMD_ALIGNED(uint8 temp[64]);                  \
+    int r = width & MASK;                          \
+    int n = width & ~MASK;                         \
+    if (n > 0) {                                   \
+      ANY_SIMD(dst_ptr, v32, n);                   \
+    }                                              \
+    ANY_SIMD(temp, v32, MASK + 1);                 \
+    memcpy(dst_ptr + n * BPP, temp, r * BPP);      \
+  }
 
 #ifdef HAS_SETROW_X86
 ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
@@ -664,43 +871,26 @@
 #ifdef HAS_ARGBSETROW_NEON
 ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
 #endif
+#ifdef HAS_ARGBSETROW_MSA
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32, 4, 3)
+#endif
 #undef ANY1
 
 // Any 1 to 2.  Outputs UV planes.
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                 \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
-      SIMD_ALIGNED(uint8 temp[128 * 3]);                                       \
-      memset(temp, 0, 128);  /* for msan */                                    \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_u, dst_v, n);                                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      /* repeat last 4 bytes for 422 subsampler */                             \
-      if ((width & 1) && BPP == 4 && DUVSHIFT == 1) {                          \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-      }                                                                        \
-      /* repeat last 4 - 12 bytes for 411 subsampler */                        \
-      if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-        memcpy(temp + SS(r, UVSHIFT) * BPP + BPP,                              \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2);                    \
-      }                                                                        \
-      if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2);                \
-      }                                                                        \
-      if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-      }                                                                        \
-      ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                        \
-      memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));            \
-      memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));            \
-    }
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                \
+  void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) { \
+    SIMD_ALIGNED(uint8 temp[128 * 3]);                                        \
+    memset(temp, 0, 128); /* for msan */                                      \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(src_ptr, dst_u, dst_v, n);                                     \
+    }                                                                         \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);       \
+    ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                         \
+    memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));             \
+    memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));             \
+  }
 
 #ifdef HAS_SPLITUVROW_SSE2
 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
@@ -727,37 +917,41 @@
 #endif
 #ifdef HAS_YUY2TOUV422ROW_NEON
 ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
-ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
 ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
 ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
 #endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
 #undef ANY12
 
 // Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
 // 128 byte row allows for 32 avx ARGB pixels.
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, int src_stride_ptr,                     \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
-      memset(temp, 0, 128 * 2);  /* for msan */                                \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \
-             SS(r, UVSHIFT) * BPP);                                            \
-      if ((width & 1) && UVSHIFT == 0) {  /* repeat last pixel for subsample */\
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-        memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
-               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
-      }                                                                        \
-      ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
-      memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
-      memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
-    }
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                        \
+  void NAMEANY(const uint8* src_ptr, int src_stride_ptr, uint8* dst_u,       \
+               uint8* dst_v, int width) {                                    \
+    SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
+    memset(temp, 0, 128 * 2); /* for msan */                                 \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
+    }                                                                        \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);      \
+    memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP,      \
+           SS(r, UVSHIFT) * BPP);                                            \
+    if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+      memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+             BPP);                                                           \
+      memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+             temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
+    }                                                                        \
+    ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
+    memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
+    memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
+  }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
@@ -783,30 +977,57 @@
 #ifdef HAS_ARGBTOUVROW_NEON
 ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVROW_MSA
+ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_ARGBTOUVJROW_NEON
 ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVJROW_MSA
+ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_BGRATOUVROW_NEON
 ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_BGRATOUVROW_MSA
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_ABGRTOUVROW_NEON
 ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVROW_MSA
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_RGBATOUVROW_NEON
 ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_RGBATOUVROW_MSA
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_RGB24TOUVROW_NEON
 ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
 #endif
+#ifdef HAS_RGB24TOUVROW_MSA
+ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
+#endif
 #ifdef HAS_RAWTOUVROW_NEON
 ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
 #endif
+#ifdef HAS_RAWTOUVROW_MSA
+ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
+#endif
 #ifdef HAS_RGB565TOUVROW_NEON
 ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
 #endif
+#ifdef HAS_RGB565TOUVROW_MSA
+ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
+#endif
 #ifdef HAS_ARGB1555TOUVROW_NEON
 ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
 #endif
+#ifdef HAS_ARGB1555TOUVROW_MSA
+ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
+#endif
 #ifdef HAS_ARGB4444TOUVROW_NEON
 ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
 #endif
@@ -816,6 +1037,24 @@
 #ifdef HAS_UYVYTOUVROW_NEON
 ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
 #endif
+#ifdef HAS_BGRATOUVROW_DSPR2
+ANY12S(BGRAToUVRow_Any_DSPR2, BGRAToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_DSPR2
+ANY12S(ABGRToUVRow_Any_DSPR2, ABGRToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_DSPR2
+ANY12S(RGBAToUVRow_Any_DSPR2, RGBAToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_DSPR2
+ANY12S(ARGBToUVRow_Any_DSPR2, ARGBToUVRow_DSPR2, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
+#endif
 #undef ANY12S
 
 #ifdef __cplusplus
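
One detail of the ANY12S staging above worth spelling out: when the width is odd and UVSHIFT is 0, the /* repeat last pixel for subsample */ memcpys duplicate the final staged pixel in both source rows so the 2x2-averaging UV kernel never reads undefined data. A standalone illustration of that step, assuming 4-byte ARGB pixels and a 7-pixel remainder (not libyuv code):

/* Stage an odd remainder for a 2-row UV kernel: with 7 leftover ARGB
 * pixels, slot 8 of each staged row gets a copy of pixel 7. */
#include <string.h>

typedef unsigned char uint8;

void StageOddRemainder(const uint8* row0, const uint8* row1, uint8 temp[256]) {
  const int kBpp = 4; /* ARGB */
  const int r = 7;    /* width & MASK, with UVSHIFT == 0 */
  memset(temp, 0, 256);
  memcpy(temp, row0, r * kBpp);       /* stage first source row  */
  memcpy(temp + 128, row1, r * kBpp); /* stage second source row */
  /* duplicate the last staged pixel into slot r of both rows */
  memcpy(temp + r * kBpp, temp + (r - 1) * kBpp, kBpp);
  memcpy(temp + 128 + r * kBpp, temp + 128 + (r - 1) * kBpp, kBpp);
}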
diff --git a/files/source/row_common.cc b/files/source/row_common.cc
index 32d2f68..bf953ee 100644
--- a/files/source/row_common.cc
+++ b/files/source/row_common.cc
@@ -40,7 +40,7 @@
   int m = v >> 31;
   return (v + m) ^ m;
 }
-#else  // USE_BRANCHLESS
+#else   // USE_BRANCHLESS
 static __inline int32 clamp0(int32 v) {
   return (v < 0) ? 0 : v;
 }
@@ -129,7 +129,8 @@
   }
 }
 
-void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555,
+                         uint8* dst_argb,
                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
@@ -146,7 +147,8 @@
   }
 }
 
-void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444,
+                         uint8* dst_argb,
                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
@@ -200,8 +202,8 @@
     uint8 b1 = src_argb[4] >> 3;
     uint8 g1 = src_argb[5] >> 2;
     uint8 r1 = src_argb[6] >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
@@ -221,8 +223,10 @@
 // endian will not affect order of the original matrix.  But the dither4
 // will contain the first pixel in the lower byte for little endian
 // or the upper byte for big endian.
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint32 dither4, int width) {
+void ARGBToRGB565DitherRow_C(const uint8* src_argb,
+                             uint8* dst_rgb,
+                             const uint32 dither4,
+                             int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     int dither0 = ((const unsigned char*)(&dither4))[x & 3];
@@ -233,8 +237,8 @@
     uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
     uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
     uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
@@ -258,9 +262,8 @@
     uint8 g1 = src_argb[5] >> 3;
     uint8 r1 = src_argb[6] >> 3;
     uint8 a1 = src_argb[7] >> 7;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+    *(uint32*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+                          (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
     dst_rgb += 4;
     src_argb += 8;
   }
@@ -269,8 +272,7 @@
     uint8 g0 = src_argb[1] >> 3;
     uint8 r0 = src_argb[2] >> 3;
     uint8 a0 = src_argb[3] >> 7;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
   }
 }
 
@@ -285,9 +287,8 @@
     uint8 g1 = src_argb[5] >> 4;
     uint8 r1 = src_argb[6] >> 4;
     uint8 a1 = src_argb[7] >> 4;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+    *(uint32*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | (b1 << 16) |
+                          (g1 << 20) | (r1 << 24) | (a1 << 28);
     dst_rgb += 4;
     src_argb += 8;
   }
@@ -296,13 +297,12 @@
     uint8 g0 = src_argb[1] >> 4;
     uint8 r0 = src_argb[2] >> 4;
     uint8 a0 = src_argb[3] >> 4;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+    *(uint16*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
   }
 }
 
 static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
-  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
+  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
 }
 
 static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
@@ -312,41 +312,45 @@
   return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
 }
 
-#define MAKEROWY(NAME, R, G, B, BPP) \
-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
-                       uint8* dst_u, uint8* dst_v, int width) {                \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
-               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
-    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
-               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
-    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
-               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
-    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
-    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-  }                                                                            \
-}
+// ARGBToY_C and ARGBToUV_C
+#define MAKEROWY(NAME, R, G, B, BPP)                                     \
+  void NAME##ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+    int x;                                                               \
+    for (x = 0; x < width; ++x) {                                        \
+      dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);       \
+      src_argb0 += BPP;                                                  \
+      dst_y += 1;                                                        \
+    }                                                                    \
+  }                                                                      \
+  void NAME##ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,        \
+                       uint8* dst_u, uint8* dst_v, int width) {          \
+    const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                   \
+    int x;                                                               \
+    for (x = 0; x < width - 1; x += 2) {                                 \
+      uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] +        \
+                  src_rgb1[B + BPP]) >>                                  \
+                 2;                                                      \
+      uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] +        \
+                  src_rgb1[G + BPP]) >>                                  \
+                 2;                                                      \
+      uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] +        \
+                  src_rgb1[R + BPP]) >>                                  \
+                 2;                                                      \
+      dst_u[0] = RGBToU(ar, ag, ab);                                     \
+      dst_v[0] = RGBToV(ar, ag, ab);                                     \
+      src_rgb0 += BPP * 2;                                               \
+      src_rgb1 += BPP * 2;                                               \
+      dst_u += 1;                                                        \
+      dst_v += 1;                                                        \
+    }                                                                    \
+    if (width & 1) {                                                     \
+      uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                       \
+      uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                       \
+      uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                       \
+      dst_u[0] = RGBToU(ar, ag, ab);                                     \
+      dst_v[0] = RGBToV(ar, ag, ab);                                     \
+    }                                                                    \
+  }
 
 MAKEROWY(ARGB, 2, 1, 0, 4)
 MAKEROWY(BGRA, 1, 2, 3, 4)
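
RGBToY above is the usual limited-range BT.601 luma in 8.8 fixed point: 66, 129 and 25 are the 0.257 / 0.504 / 0.098 weights scaled by 256, and 0x1080 folds in the +16 offset (16 << 8) plus 128 for rounding. A quick end-point check, as a standalone sketch rather than libyuv test code:

/* Standalone check of RGBToY's end points (not libyuv test code). */
#include <assert.h>

static int RGBToY(int r, int g, int b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; /* same expression as above */
}

int main(void) {
  assert(RGBToY(0, 0, 0) == 16);         /* black maps to video-range 16 */
  assert(RGBToY(255, 255, 255) == 235);  /* white maps to video-range 235 */
  return 0;
}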
@@ -382,7 +386,7 @@
 // r  0.50000 * 255 = 127.5 = 127
 
 static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
-  return (38 * r + 75 * g +  15 * b + 64) >> 7;
+  return (38 * r + 75 * g + 15 * b + 64) >> 7;
 }
 
 static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
@@ -394,41 +398,42 @@
 
 #define AVGB(a, b) (((a) + (b) + 1) >> 1)
 
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
-                        uint8* dst_u, uint8* dst_v, int width) {               \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
-                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
-    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
-                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
-    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
-                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
-    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
-    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-  }                                                                            \
-}
+// ARGBToYJ_C and ARGBToUVJ_C
+#define MAKEROWYJ(NAME, R, G, B, BPP)                                     \
+  void NAME##ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+    int x;                                                                \
+    for (x = 0; x < width; ++x) {                                         \
+      dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);       \
+      src_argb0 += BPP;                                                   \
+      dst_y += 1;                                                         \
+    }                                                                     \
+  }                                                                       \
+  void NAME##ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,        \
+                        uint8* dst_u, uint8* dst_v, int width) {          \
+    const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                    \
+    int x;                                                                \
+    for (x = 0; x < width - 1; x += 2) {                                  \
+      uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                     \
+                      AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));        \
+      uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                     \
+                      AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));        \
+      uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                     \
+                      AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));        \
+      dst_u[0] = RGBToUJ(ar, ag, ab);                                     \
+      dst_v[0] = RGBToVJ(ar, ag, ab);                                     \
+      src_rgb0 += BPP * 2;                                                \
+      src_rgb1 += BPP * 2;                                                \
+      dst_u += 1;                                                         \
+      dst_v += 1;                                                         \
+    }                                                                     \
+    if (width & 1) {                                                      \
+      uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                          \
+      uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                          \
+      uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                          \
+      dst_u[0] = RGBToUJ(ar, ag, ab);                                     \
+      dst_v[0] = RGBToVJ(ar, ag, ab);                                     \
+    }                                                                     \
+  }
 
 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 #undef MAKEROWYJ
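
RGBToYJ is the full-range (JPEG) counterpart in 7-bit fixed point, so black and white land on 0 and 255 rather than 16 and 235. The matching end-point check, again as a standalone sketch:

/* Standalone check of RGBToYJ's full-range end points (not libyuv test code). */
#include <assert.h>

static int RGBToYJ(int r, int g, int b) {
  return (38 * r + 75 * g + 15 * b + 64) >> 7; /* same expression as above */
}

int main(void) {
  assert(RGBToYJ(0, 0, 0) == 0);          /* black maps to 0 */
  assert(RGBToYJ(255, 255, 255) == 255);  /* white maps to 255 */
  return 0;
}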
@@ -478,8 +483,11 @@
   }
 }
 
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
-                     uint8* dst_u, uint8* dst_v, int width) {
+void RGB565ToUVRow_C(const uint8* src_rgb565,
+                     int src_stride_rgb565,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
   const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -525,8 +533,11 @@
   }
 }
 
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGB1555ToUVRow_C(const uint8* src_argb1555,
+                       int src_stride_argb1555,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -573,8 +584,11 @@
   }
 }
 
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGB4444ToUVRow_C(const uint8* src_argb4444,
+                       int src_stride_argb4444,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -622,7 +636,9 @@
 }
 
 void ARGBToUV444Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
     uint8 ab = src_argb[0];
@@ -636,41 +652,6 @@
   }
 }
 
-void ARGBToUV411Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  int x;
-  for (x = 0; x < width - 3; x += 4) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-    src_argb += 16;
-    dst_u += 1;
-    dst_v += 1;
-  }
-  // Odd width handling mimics 'any' function which replicates last pixel.
-  if ((width & 3) == 3) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 2) {
-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 1) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  }
-}
-
 void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
@@ -702,22 +683,28 @@
 
 // Apply color matrix to a row of image. Matrix is signed.
 // TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
-                          const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_C(const uint8* src_argb,
+                          uint8* dst_argb,
+                          const int8* matrix_argb,
+                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = src_argb[0];
     int g = src_argb[1];
     int r = src_argb[2];
     int a = src_argb[3];
-    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
-              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
-    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
-              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
-    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
-              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
-    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
-              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
+    int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] +
+              a * matrix_argb[3]) >>
+             6;
+    int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] +
+              a * matrix_argb[7]) >>
+             6;
+    int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] +
+              a * matrix_argb[11]) >>
+             6;
+    int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] +
+              a * matrix_argb[15]) >>
+             6;
     dst_argb[0] = Clamp(sb);
     dst_argb[1] = Clamp(sg);
     dst_argb[2] = Clamp(sr);
@@ -757,8 +744,11 @@
   }
 }
 
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
-                       int interval_offset, int width) {
+void ARGBQuantizeRow_C(uint8* dst_argb,
+                       int scale,
+                       int interval_size,
+                       int interval_offset,
+                       int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -772,9 +762,11 @@
 }
 
 #define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
+#define SHADE(f, v) v * f >> 24
 
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_C(const uint8* src_argb,
+                    uint8* dst_argb,
+                    int width,
                     uint32 value) {
   const uint32 b_scale = REPEAT8(value & 0xff);
   const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
@@ -799,10 +791,12 @@
 #undef SHADE
 
 #define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 16
+#define SHADE(f, v) v * f >> 16
 
-void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBMultiplyRow_C(const uint8* src_argb0,
+                       const uint8* src_argb1,
+                       uint8* dst_argb,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const uint32 b = REPEAT8(src_argb0[0]);
@@ -827,8 +821,10 @@
 
 #define SHADE(f, v) clamp255(v + f)
 
-void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                  uint8* dst_argb, int width) {
+void ARGBAddRow_C(const uint8* src_argb0,
+                  const uint8* src_argb1,
+                  uint8* dst_argb,
+                  int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const int b = src_argb0[0];
@@ -852,8 +848,10 @@
 
 #define SHADE(f, v) clamp0(f - v)
 
-void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBSubtractRow_C(const uint8* src_argb0,
+                       const uint8* src_argb1,
+                       uint8* dst_argb,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const int b = src_argb0[0];
@@ -876,8 +874,11 @@
 #undef SHADE
 
 // Sobel functions which mimics SSSE3.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
-                 uint8* dst_sobelx, int width) {
+void SobelXRow_C(const uint8* src_y0,
+                 const uint8* src_y1,
+                 const uint8* src_y2,
+                 uint8* dst_sobelx,
+                 int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int a = src_y0[i];
@@ -894,8 +895,10 @@
   }
 }
 
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
-                 uint8* dst_sobely, int width) {
+void SobelYRow_C(const uint8* src_y0,
+                 const uint8* src_y1,
+                 uint8* dst_sobely,
+                 int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int a = src_y0[i + 0];
@@ -912,8 +915,10 @@
   }
 }
 
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                uint8* dst_argb, int width) {
+void SobelRow_C(const uint8* src_sobelx,
+                const uint8* src_sobely,
+                uint8* dst_argb,
+                int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
@@ -927,8 +932,10 @@
   }
 }
 
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_y, int width) {
+void SobelToPlaneRow_C(const uint8* src_sobelx,
+                       const uint8* src_sobely,
+                       uint8* dst_y,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
@@ -938,8 +945,10 @@
   }
 }
 
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                  uint8* dst_argb, int width) {
+void SobelXYRow_C(const uint8* src_sobelx,
+                  const uint8* src_sobely,
+                  uint8* dst_argb,
+                  int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
@@ -974,75 +983,69 @@
 //  B = (Y - 16) * 1.164 - U * -2.018
 
 // Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
 // U and V contributions to R,G,B.
 #define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
+#define UG 25   /* round(0.391 * 64) */
+#define VG 52   /* round(0.813 * 64) */
 #define VR -102 /* round(-1.596 * 64) */
 
 // Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
 
-#if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-#elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+#if defined(__aarch64__)  // 64 bit arm
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
+#elif defined(__arm__)  // 32 bit arm
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #else
-const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 #endif
 
 #undef BB
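
The kYuvI601Constants tables above are these fixed-point coefficients laid out per architecture, and each #define carries its own derivation in its comment: the chroma weights are scaled by 64, while YG additionally carries a 256 * 256 / 257 factor, presumably so that the y * 0x0101 product used in the tables, shifted right by 16, leaves luma on the same 64x scale. A standalone sketch that checks the values against the commented derivations (assumes C99 lround; not part of libyuv):

/* Verify the BT.601 #define values above against their derivation comments. */
#include <assert.h>
#include <math.h>

int main(void) {
  assert(lround(1.164 * 64 * 256 * 256 / 257) == 18997); /* YG  */
  assert(lround(1.164 * 64 * -16 + 64 / 2) == -1160);    /* YGB */
  assert(lround(0.391 * 64) == 25);                      /* UG  */
  assert(lround(0.813 * 64) == 52);                      /* VG  */
  assert(lround(-1.596 * 64) == -102);                   /* VR  */
  assert(lround(-2.018 * 64) < -128); /* hence UB is clamped to -128 */
  return 0;
}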
@@ -1062,74 +1065,68 @@
 
 // Y contribution to R,G,B.  Scale and bias.
 #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32  /* 64 / 2 */
+#define YGB 32   /* 64 / 2 */
 
 // U and V contributions to R,G,B.
 #define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414  * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
+#define UG 22   /* round(0.34414 * 64) */
+#define VG 46   /* round(0.71414  * 64) */
+#define VR -90  /* round(-1.40200 * 64) */
 
 // Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
 
 #if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #else
-const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 #endif
 
 #undef BB
@@ -1143,81 +1140,76 @@
 #undef YG
 
 // BT.709 YUV to RGB reference
-// *  R = Y                - V * -1.28033
-// *  G = Y - U *  0.21482 - V *  0.38059
-// *  B = Y - U * -2.12798
+//  R = (Y - 16) * 1.164              - V * -1.793
+//  G = (Y - 16) * 1.164 - U *  0.213 - V *  0.533
+//  B = (Y - 16) * 1.164 - U * -2.112
+// See also http://www.equasys.de/colorconversion.html
 
 // Y contribution to R,G,B.  Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32  /* 64 / 2 */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
+// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
 // U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.12798 * 64)) */
-#define UG 14 /* round(0.21482 * 64) */
-#define VG 24 /* round(0.38059  * 64) */
-#define VR -82 /* round(-1.28033 * 64) */
+#define UB -128 /* max(-128, round(-2.112 * 64)) */
+#define UG 14   /* round(0.213 * 64) */
+#define VG 34   /* round(0.533  * 64) */
+#define VR -115 /* round(-1.793 * 64) */
 
 // Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
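A quick way to sanity-check the fixed-point values documented in the comments above is to recompute them. The standalone sketch below is illustrative only (not part of this patch) and assumes C99 round() from libm.

#include <assert.h>
#include <math.h>

// Recompute the BT.709 6-bit fixed-point constants derived in the comments.
int main(void) {
  assert((int)round(1.164 * 64 * 256 * 256 / 257) == 18997);  // YG
  assert((int)round(1.164 * 64 * -16 + 64 / 2) == -1160);     // YGB
  assert((int)round(0.213 * 64) == 14);                       // UG
  assert((int)round(0.533 * 64) == 34);                       // VG
  assert((int)round(-1.793 * 64) == -115);                    // VR
  assert((int)round(-2.112 * 64) < -128);  // UB saturates at the int8 limit.
  return 0;
}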
 
 #if defined(__aarch64__)
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #elif defined(__arm__)
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #else
-const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
-const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 #endif
 
 #undef BB
@@ -1231,8 +1223,12 @@
 #undef YG
 
 // C reference code that mimics the YUV assembly.
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
-                              uint8* b, uint8* g, uint8* r,
+static __inline void YuvPixel(uint8 y,
+                              uint8 u,
+                              uint8 v,
+                              uint8* b,
+                              uint8* g,
+                              uint8* r,
                               const struct YuvConstants* yuvconstants) {
 #if defined(__aarch64__)
   int ub = -yuvconstants->kUVToRB[0];
@@ -1264,13 +1260,13 @@
 #endif
 
   uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
-  *b = Clamp((int32)(-(u * ub         ) + y1 + bb) >> 6);
+  *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6);
   *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
-  *r = Clamp((int32)(-(         v * vr) + y1 + br) >> 6);
+  *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
 }
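As an illustration of how YuvPixel and the constant tables above fit together, here is a hypothetical check (it would have to live in this file, since YuvPixel is static): with U = V = 128 the chroma terms cancel, so limit-range BT.709 black (Y = 16) and white (Y = 235) should come out as 0 and 255 on all three channels.

// Hypothetical, for illustration only; not part of this patch.
static void CheckYuvPixelH709(void) {
  uint8 b, g, r;
  YuvPixel(16, 128, 128, &b, &g, &r, &kYuvH709Constants);   // expect 0, 0, 0
  YuvPixel(235, 128, 128, &b, &g, &r, &kYuvH709Constants);  // expect 255, 255, 255
  (void)b;
  (void)g;
  (void)r;
}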
 
 // Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
 // C reference code that mimics the YUV assembly.
@@ -1310,8 +1306,8 @@
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
@@ -1324,8 +1320,8 @@
                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
     src_y += 1;
     src_u += 1;
@@ -1344,11 +1340,11 @@
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_u += 1;
@@ -1356,8 +1352,8 @@
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
@@ -1371,11 +1367,11 @@
                           int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = src_a[0];
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = src_a[1];
     src_y += 2;
     src_u += 1;
@@ -1384,8 +1380,8 @@
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = src_a[0];
   }
 }
@@ -1398,18 +1394,18 @@
                       int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
     src_y += 2;
     src_u += 1;
     src_v += 1;
     rgb_buf += 6;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
   }
 }
 
@@ -1435,8 +1431,8 @@
     b1 = b1 >> 4;
     g1 = g1 >> 4;
     r1 = r1 >> 4;
-    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
+                               (g1 << 20) | (r1 << 24) | 0xf000f000;
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1447,8 +1443,7 @@
     b0 = b0 >> 4;
     g0 = g0 >> 4;
     r0 = r0 >> 4;
-    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        0xf000;
+    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
   }
 }
 
@@ -1474,8 +1469,8 @@
     b1 = b1 >> 3;
     g1 = g1 >> 3;
     r1 = r1 >> 3;
-    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
+                               (g1 << 21) | (r1 << 26) | 0x80008000;
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1486,8 +1481,7 @@
     b0 = b0 >> 3;
     g0 = g0 >> 3;
     r0 = r0 >> 3;
-    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        0x8000;
+    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
   }
 }
 
@@ -1513,8 +1507,8 @@
     b1 = b1 >> 3;
     g1 = g1 >> 2;
     r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    *(uint32*)(dst_rgb565) =
+        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1529,48 +1523,6 @@
   }
 }
 
-void I411ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 3; x += 4) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    YuvPixel(src_y[2], src_u[0], src_v[0],
-             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);
-    rgb_buf[11] = 255;
-    YuvPixel(src_y[3], src_u[0], src_v[0],
-             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);
-    rgb_buf[15] = 255;
-    src_y += 4;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 16;  // Advance 4 pixels.
-  }
-  if (width & 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-
 void NV12ToARGBRow_C(const uint8* src_y,
                      const uint8* src_uv,
                      uint8* rgb_buf,
@@ -1578,19 +1530,19 @@
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_uv[0], src_uv[1],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_uv += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
@@ -1602,19 +1554,19 @@
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_vu[1], src_vu[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_vu += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
@@ -1640,8 +1592,8 @@
     b1 = b1 >> 3;
     g1 = g1 >> 2;
     r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    *(uint32*)(dst_rgb565) =
+        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
     src_y += 2;
     src_uv += 2;
     dst_rgb565 += 4;  // Advance 2 pixels.
@@ -1661,18 +1613,18 @@
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_yuy2 += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
@@ -1683,18 +1635,18 @@
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_uyvy += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
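The subscripts used by the two packed-422 rows above mirror the byte order of the formats; the sketch below is illustrative only and simply spells out that layout.

// One 4-byte macropixel carries two luma samples that share a U/V pair.
// YUY2: Y0 U Y1 V  -> Y at [0] and [2], U at [1], V at [3].
// UYVY: U Y0 V Y1  -> Y at [1] and [3], U at [0], V at [2].
static void PackYUY2(uint8 y0, uint8 u, uint8 y1, uint8 v, uint8 dst[4]) {
  dst[0] = y0;
  dst[1] = u;
  dst[2] = y1;
  dst[3] = v;
}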
@@ -1707,11 +1659,11 @@
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+             rgb_buf + 3, yuvconstants);
     rgb_buf[0] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
+             rgb_buf + 7, yuvconstants);
     rgb_buf[4] = 255;
     src_y += 2;
     src_u += 1;
@@ -1719,8 +1671,8 @@
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+             rgb_buf + 3, yuvconstants);
     rgb_buf[0] = 255;
   }
 }
@@ -1800,7 +1752,9 @@
   }
 }
 
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_C(const uint8* src_u,
+                  const uint8* src_v,
+                  uint8* dst_uv,
                   int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1837,8 +1791,11 @@
 }
 
 // Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_C(const uint8* src_yuy2,
+                   int src_stride_yuy2,
+                   uint8* dst_u,
+                   uint8* dst_v,
+                   int width) {
   // Output a row of UV values, filtering 2 rows of YUY2.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1852,7 +1809,9 @@
 
 // Copy row of YUY2 UV's (422) into U and V (422).
 void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1879,8 +1838,11 @@
 }
 
 // Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_C(const uint8* src_uyvy,
+                   int src_stride_uyvy,
+                   uint8* dst_u,
+                   uint8* dst_v,
+                   int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1894,7 +1856,9 @@
 
 // Copy row of UYVY UV's (422) into U and V (422).
 void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1925,8 +1889,10 @@
 // Blend src_argb0 over src_argb1 and store to dst_argb.
 // dst_argb may be src_argb0 or src_argb1.
 // This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                    uint8* dst_argb, int width) {
+void ARGBBlendRow_C(const uint8* src_argb0,
+                    const uint8* src_argb1,
+                    uint8* dst_argb,
+                    int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     uint32 fb = src_argb0[0];
@@ -1973,9 +1939,12 @@
 }
 #undef BLEND
 
-#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
-                     const uint8* alpha, uint8* dst, int width) {
+#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
+void BlendPlaneRow_C(const uint8* src0,
+                     const uint8* src1,
+                     const uint8* alpha,
+                     uint8* dst,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
@@ -2039,38 +2008,43 @@
 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
 #define T(a) 0x01000000 + (0x10000 / a)
 const uint32 fixed_invtbl8[256] = {
-  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
-  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
-  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
-  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
-  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
-  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
-  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
-  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
-  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
-  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
-  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
-  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
-  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
-  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
-  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
-  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
-  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
-  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
-  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
-  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
-  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
-  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
-  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
-  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
-  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
-  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
-  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
-  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
-  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
-  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
-  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
-  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+    0x01000000, 0x0100ffff, T(0x02), T(0x03),   T(0x04), T(0x05), T(0x06),
+    T(0x07),    T(0x08),    T(0x09), T(0x0a),   T(0x0b), T(0x0c), T(0x0d),
+    T(0x0e),    T(0x0f),    T(0x10), T(0x11),   T(0x12), T(0x13), T(0x14),
+    T(0x15),    T(0x16),    T(0x17), T(0x18),   T(0x19), T(0x1a), T(0x1b),
+    T(0x1c),    T(0x1d),    T(0x1e), T(0x1f),   T(0x20), T(0x21), T(0x22),
+    T(0x23),    T(0x24),    T(0x25), T(0x26),   T(0x27), T(0x28), T(0x29),
+    T(0x2a),    T(0x2b),    T(0x2c), T(0x2d),   T(0x2e), T(0x2f), T(0x30),
+    T(0x31),    T(0x32),    T(0x33), T(0x34),   T(0x35), T(0x36), T(0x37),
+    T(0x38),    T(0x39),    T(0x3a), T(0x3b),   T(0x3c), T(0x3d), T(0x3e),
+    T(0x3f),    T(0x40),    T(0x41), T(0x42),   T(0x43), T(0x44), T(0x45),
+    T(0x46),    T(0x47),    T(0x48), T(0x49),   T(0x4a), T(0x4b), T(0x4c),
+    T(0x4d),    T(0x4e),    T(0x4f), T(0x50),   T(0x51), T(0x52), T(0x53),
+    T(0x54),    T(0x55),    T(0x56), T(0x57),   T(0x58), T(0x59), T(0x5a),
+    T(0x5b),    T(0x5c),    T(0x5d), T(0x5e),   T(0x5f), T(0x60), T(0x61),
+    T(0x62),    T(0x63),    T(0x64), T(0x65),   T(0x66), T(0x67), T(0x68),
+    T(0x69),    T(0x6a),    T(0x6b), T(0x6c),   T(0x6d), T(0x6e), T(0x6f),
+    T(0x70),    T(0x71),    T(0x72), T(0x73),   T(0x74), T(0x75), T(0x76),
+    T(0x77),    T(0x78),    T(0x79), T(0x7a),   T(0x7b), T(0x7c), T(0x7d),
+    T(0x7e),    T(0x7f),    T(0x80), T(0x81),   T(0x82), T(0x83), T(0x84),
+    T(0x85),    T(0x86),    T(0x87), T(0x88),   T(0x89), T(0x8a), T(0x8b),
+    T(0x8c),    T(0x8d),    T(0x8e), T(0x8f),   T(0x90), T(0x91), T(0x92),
+    T(0x93),    T(0x94),    T(0x95), T(0x96),   T(0x97), T(0x98), T(0x99),
+    T(0x9a),    T(0x9b),    T(0x9c), T(0x9d),   T(0x9e), T(0x9f), T(0xa0),
+    T(0xa1),    T(0xa2),    T(0xa3), T(0xa4),   T(0xa5), T(0xa6), T(0xa7),
+    T(0xa8),    T(0xa9),    T(0xaa), T(0xab),   T(0xac), T(0xad), T(0xae),
+    T(0xaf),    T(0xb0),    T(0xb1), T(0xb2),   T(0xb3), T(0xb4), T(0xb5),
+    T(0xb6),    T(0xb7),    T(0xb8), T(0xb9),   T(0xba), T(0xbb), T(0xbc),
+    T(0xbd),    T(0xbe),    T(0xbf), T(0xc0),   T(0xc1), T(0xc2), T(0xc3),
+    T(0xc4),    T(0xc5),    T(0xc6), T(0xc7),   T(0xc8), T(0xc9), T(0xca),
+    T(0xcb),    T(0xcc),    T(0xcd), T(0xce),   T(0xcf), T(0xd0), T(0xd1),
+    T(0xd2),    T(0xd3),    T(0xd4), T(0xd5),   T(0xd6), T(0xd7), T(0xd8),
+    T(0xd9),    T(0xda),    T(0xdb), T(0xdc),   T(0xdd), T(0xde), T(0xdf),
+    T(0xe0),    T(0xe1),    T(0xe2), T(0xe3),   T(0xe4), T(0xe5), T(0xe6),
+    T(0xe7),    T(0xe8),    T(0xe9), T(0xea),   T(0xeb), T(0xec), T(0xed),
+    T(0xee),    T(0xef),    T(0xf0), T(0xf1),   T(0xf2), T(0xf3), T(0xf4),
+    T(0xf5),    T(0xf6),    T(0xf7), T(0xf8),   T(0xf9), T(0xfa), T(0xfb),
+    T(0xfc),    T(0xfd),    T(0xfe), 0x01000100};
 #undef T
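For context, a hedged sketch of how a table entry is meant to be consumed, mirroring the 8.8 fixed-point scheme in the comment above (the concrete use inside the unattenuate rows may differ slightly): the low half of T(128) is 0x10000 / 128 = 0x0200, i.e. 2.0 in 8.8 fixed point, so a channel that was attenuated by alpha = 128 is roughly doubled.

// Illustrative fragment, as it might appear inside a row loop:
uint32 ia = fixed_invtbl8[128] & 0xffff;    // low half: 0x0200 == 2.0 in 8.8
uint8 restored = (uint8)((100 * ia) >> 8);  // 100 * 2.0 == 200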
 
 void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -2094,8 +2068,10 @@
   }
 }
 
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
-                               const int32* previous_cumsum, int width) {
+void ComputeCumulativeSumRow_C(const uint8* row,
+                               int32* cumsum,
+                               const int32* previous_cumsum,
+                               int width) {
   int32 row_sum[4] = {0, 0, 0, 0};
   int x;
   for (x = 0; x < width; ++x) {
@@ -2103,15 +2079,19 @@
     row_sum[1] += row[x * 4 + 1];
     row_sum[2] += row[x * 4 + 2];
     row_sum[3] += row[x * 4 + 3];
-    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
-    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
-    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
-    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
+    cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+    cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+    cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+    cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
   }
 }
 
-void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
-                                int w, int area, uint8* dst, int count) {
+void CumulativeSumToAverageRow_C(const int32* tl,
+                                 const int32* bl,
+                                 int w,
+                                 int area,
+                                 uint8* dst,
+                                 int count) {
   float ooa = 1.0f / area;
   int i;
   for (i = 0; i < count; ++i) {
@@ -2127,8 +2107,11 @@
 
 // Copy pixels from rotated source to destination row with a slope.
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width) {
+void ARGBAffineRow_C(const uint8* src_argb,
+                     int src_argb_stride,
+                     uint8* dst_argb,
+                     const float* uv_dudv,
+                     int width) {
   int i;
   // Render a row of pixels from source into a buffer.
   float uv[2];
@@ -2138,8 +2121,7 @@
     int x = (int)(uv[0]);
     int y = (int)(uv[1]);
     *(uint32*)(dst_argb) =
-        *(const uint32*)(src_argb + y * src_argb_stride +
-                                         x * 4);
+        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
     dst_argb += 4;
     uv[0] += uv_dudv[2];
     uv[1] += uv_dudv[3];
@@ -2147,16 +2129,20 @@
 }
 
 // Blend 2 rows into 1.
-static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride,
-                      uint8* dst_uv, int width) {
+static void HalfRow_C(const uint8* src_uv,
+                      ptrdiff_t src_uv_stride,
+                      uint8* dst_uv,
+                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
   }
 }
 
-static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
-                         uint16* dst_uv, int width) {
+static void HalfRow_16_C(const uint16* src_uv,
+                         ptrdiff_t src_uv_stride,
+                         uint16* dst_uv,
+                         int width) {
   int x;
   for (x = 0; x < width; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
@@ -2164,10 +2150,12 @@
 }
 
 // C version 2x2 -> 2x1.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+void InterpolateRow_C(uint8* dst_ptr,
+                      const uint8* src_ptr,
                       ptrdiff_t src_stride,
-                      int width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction ;
+                      int width,
+                      int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
   const uint8* src_ptr1 = src_ptr + src_stride;
   int x;
@@ -2194,9 +2182,11 @@
   }
 }
 
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+void InterpolateRow_16_C(uint16* dst_ptr,
+                         const uint16* src_ptr,
                          ptrdiff_t src_stride,
-                         int width, int source_y_fraction) {
+                         int width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
   const uint16* src_ptr1 = src_ptr + src_stride;
@@ -2222,8 +2212,10 @@
 }
 
 // Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-                      const uint8* shuffler, int width) {
+void ARGBShuffleRow_C(const uint8* src_argb,
+                      uint8* dst_argb,
+                      const uint8* shuffler,
+                      int width) {
   int index0 = shuffler[0];
   int index1 = shuffler[1];
   int index2 = shuffler[2];
@@ -2248,7 +2240,8 @@
 void I422ToYUY2Row_C(const uint8* src_y,
                      const uint8* src_u,
                      const uint8* src_v,
-                     uint8* dst_frame, int width) {
+                     uint8* dst_frame,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_frame[0] = src_y[0];
@@ -2271,7 +2264,8 @@
 void I422ToUYVYRow_C(const uint8* src_y,
                      const uint8* src_u,
                      const uint8* src_v,
-                     uint8* dst_frame, int width) {
+                     uint8* dst_frame,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_frame[0] = src_u[0];
@@ -2291,7 +2285,6 @@
   }
 }
 
-
 void ARGBPolynomialRow_C(const uint8* src_argb,
                          uint8* dst_argb,
                          const float* poly,
@@ -2332,8 +2325,30 @@
   }
 }
 
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                             const uint8* luma, uint32 lumacoeff) {
+// Samples are assumed to be unsigned, in the low 9, 10 or 12 bits.  The scale
+// factor adjusts the source integer range to the desired half-float range.
+
+// This magic constant is 2^-112. Multiplying by this
+// is the same as subtracting 112 from the exponent, which
+// is the difference in exponent bias between 32-bit and
+// 16-bit floats. Once we've done this subtraction, we can
+// simply extract the low bits of the exponent and the high
+// bits of the mantissa from our float and we're done.
+
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) {
+  int i;
+  float mult = 1.9259299444e-34f * scale;
+  for (i = 0; i < width; ++i) {
+    float value = src[i] * mult;
+    dst[i] = (uint16)((*(uint32_t*)&value) >> 13);
+  }
+}
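To make the bit trick concrete, the snippet below (illustrative only; assumes IEEE-754 floats) feeds one full-scale 10-bit sample through the row: 1023 * (2^-112 / 1023) is 2^-112, whose float exponent field is 127 - 112 = 15, and shifting right by 13 leaves exactly the 16-bit half-float pattern for 1.0.

// Illustrative only: one 10-bit sample, scaled so full scale maps to 1.0.
uint16 sample = 1023;
uint16 half;
HalfFloatRow_C(&sample, &half, 1.0f / 1023.0f, 1);
// half should now be 0x3C00, the IEEE half-float bit pattern for 1.0f.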
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb,
+                             uint8* dst_argb,
+                             int width,
+                             const uint8* luma,
+                             uint32 lumacoeff) {
   uint32 bc = lumacoeff & 0xff;
   uint32 gc = (lumacoeff >> 8) & 0xff;
   uint32 rc = (lumacoeff >> 16) & 0xff;
@@ -2341,15 +2356,17 @@
   int i;
   for (i = 0; i < width - 1; i += 2) {
     // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
+    const uint8* luma0 =
+        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+        luma;
     const uint8* luma1;
     dst_argb[0] = luma0[src_argb[0]];
     dst_argb[1] = luma0[src_argb[1]];
     dst_argb[2] = luma0[src_argb[2]];
     dst_argb[3] = src_argb[3];
-    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
-              src_argb[6] * rc) & 0x7F00u) + luma;
+    luma1 =
+        ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) +
+        luma;
     dst_argb[4] = luma1[src_argb[4]];
     dst_argb[5] = luma1[src_argb[5]];
     dst_argb[6] = luma1[src_argb[6]];
@@ -2359,8 +2376,9 @@
   }
   if (width & 1) {
     // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
+    const uint8* luma0 =
+        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+        luma;
     dst_argb[0] = luma0[src_argb[0]];
     dst_argb[1] = luma0[src_argb[1]];
     dst_argb[2] = luma0[src_argb[2]];
@@ -2504,7 +2522,7 @@
                           uint8* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2530,7 +2548,7 @@
                             const struct YuvConstants* yuvconstants,
                             int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2556,7 +2574,7 @@
                             const struct YuvConstants* yuvconstants,
                             int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2576,13 +2594,13 @@
 
 #if defined(HAS_I422TORGB24ROW_AVX2)
 void I422ToRGB24Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_rgb24,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2604,7 +2622,7 @@
                           const struct YuvConstants* yuvconstants,
                           int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
diff --git a/files/source/row_dspr2.cc b/files/source/row_dspr2.cc
new file mode 100644
index 0000000..466dd5d
--- /dev/null
+++ b/files/source/row_dspr2.cc
@@ -0,0 +1,1721 @@
+/*
+ *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+  __asm__ __volatile__(
+      ".set      noreorder                         \n"
+      ".set      noat                              \n"
+      "slti      $at, %[count], 8                  \n"
+      "bne       $at ,$zero, $last8                \n"
+      "xor       $t8, %[src], %[dst]               \n"
+      "andi      $t8, $t8, 0x3                     \n"
+
+      "bne       $t8, $zero, unaligned             \n"
+      "negu      $a3, %[dst]                       \n"
+      // make dst/src aligned
+      "andi      $a3, $a3, 0x3                     \n"
+      "beq       $a3, $zero, $chk16w               \n"
+      // word-aligned; now count is the remaining byte count
+      "subu     %[count], %[count], $a3            \n"
+
+      "lwr       $t8, 0(%[src])                    \n"
+      "addu      %[src], %[src], $a3               \n"
+      "swr       $t8, 0(%[dst])                    \n"
+      "addu      %[dst], %[dst], $a3               \n"
+
+      // Now the dst/src are mutually word-aligned with word-aligned addresses
+      "$chk16w:                                    \n"
+      "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+      // t8 is the byte count after 64-byte chunks
+      "beq       %[count], $t8, chk8w              \n"
+      // There will be at most 1 32-byte chunk after it
+      "subu      $a3, %[count], $t8                \n"  // the reminder
+      // Here a3 counts bytes in 16w chunks
+      "addu      $a3, %[dst], $a3                  \n"
+      // Now a3 is the final dst after 64-byte chunks
+      "addu      $t0, %[dst], %[count]             \n"
+      // t0 is the "past the end" address
+
+      // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be
+      // past the "t0-32" address
+      // This means: for x=128 the last "safe" a1 address is "t0-160"
+      // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+      // we will use "pref 30,128(a1)", so "t0-160" is the limit
+      "subu      $t9, $t0, 160                     \n"
+      // t9 is the "last safe pref 30,128(a1)" address
+      "pref      0, 0(%[src])                      \n"  // first line of src
+      "pref      0, 32(%[src])                     \n"  // second line of src
+      "pref      0, 64(%[src])                     \n"
+      "pref      30, 32(%[dst])                    \n"
+      // In case the a1 > t9 don't use "pref 30" at all
+      "sgtu      $v1, %[dst], $t9                  \n"
+      "bgtz      $v1, $loop16w                     \n"
+      "nop                                         \n"
+      // otherwise, start with using pref30
+      "pref      30, 64(%[dst])                    \n"
+      "$loop16w:                                    \n"
+      "pref      0, 96(%[src])                     \n"
+      "lw        $t0, 0(%[src])                    \n"
+      "bgtz      $v1, $skip_pref30_96              \n"  // skip
+      "lw        $t1, 4(%[src])                    \n"
+      "pref      30, 96(%[dst])                    \n"  // continue
+      "$skip_pref30_96:                            \n"
+      "lw        $t2, 8(%[src])                    \n"
+      "lw        $t3, 12(%[src])                   \n"
+      "lw        $t4, 16(%[src])                   \n"
+      "lw        $t5, 20(%[src])                   \n"
+      "lw        $t6, 24(%[src])                   \n"
+      "lw        $t7, 28(%[src])                   \n"
+      "pref      0, 128(%[src])                    \n"
+      //  bring the next lines of src, addr 128
+      "sw        $t0, 0(%[dst])                    \n"
+      "sw        $t1, 4(%[dst])                    \n"
+      "sw        $t2, 8(%[dst])                    \n"
+      "sw        $t3, 12(%[dst])                   \n"
+      "sw        $t4, 16(%[dst])                   \n"
+      "sw        $t5, 20(%[dst])                   \n"
+      "sw        $t6, 24(%[dst])                   \n"
+      "sw        $t7, 28(%[dst])                   \n"
+      "lw        $t0, 32(%[src])                   \n"
+      "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
+      "lw        $t1, 36(%[src])                   \n"
+      "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
+      "$skip_pref30_128:                           \n"
+      "lw        $t2, 40(%[src])                   \n"
+      "lw        $t3, 44(%[src])                   \n"
+      "lw        $t4, 48(%[src])                   \n"
+      "lw        $t5, 52(%[src])                   \n"
+      "lw        $t6, 56(%[src])                   \n"
+      "lw        $t7, 60(%[src])                   \n"
+      "pref      0, 160(%[src])                    \n"
+      // bring the next lines of src, addr 160
+      "sw        $t0, 32(%[dst])                   \n"
+      "sw        $t1, 36(%[dst])                   \n"
+      "sw        $t2, 40(%[dst])                   \n"
+      "sw        $t3, 44(%[dst])                   \n"
+      "sw        $t4, 48(%[dst])                   \n"
+      "sw        $t5, 52(%[dst])                   \n"
+      "sw        $t6, 56(%[dst])                   \n"
+      "sw        $t7, 60(%[dst])                   \n"
+
+      "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
+      "sgtu      $v1, %[dst], $t9                  \n"
+      "bne       %[dst], $a3, $loop16w             \n"
+      " addiu    %[src], %[src], 64                \n"  // adding 64 to src
+      "move      %[count], $t8                     \n"
+
+      // Here we have src and dest word-aligned but less than 64-bytes to go
+
+      "chk8w:                                      \n"
+      "pref      0, 0x0(%[src])                    \n"
+      "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+      // t8 is the remainder count past the 32-byte chunks
+      "beq       %[count], $t8, chk1w              \n"
+      // count == t8, no 32-byte chunk
+      " nop                                        \n"
+
+      "lw        $t0, 0(%[src])                    \n"
+      "lw        $t1, 4(%[src])                    \n"
+      "lw        $t2, 8(%[src])                    \n"
+      "lw        $t3, 12(%[src])                   \n"
+      "lw        $t4, 16(%[src])                   \n"
+      "lw        $t5, 20(%[src])                   \n"
+      "lw        $t6, 24(%[src])                   \n"
+      "lw        $t7, 28(%[src])                   \n"
+      "addiu     %[src], %[src], 32                \n"
+
+      "sw        $t0, 0(%[dst])                    \n"
+      "sw        $t1, 4(%[dst])                    \n"
+      "sw        $t2, 8(%[dst])                    \n"
+      "sw        $t3, 12(%[dst])                   \n"
+      "sw        $t4, 16(%[dst])                   \n"
+      "sw        $t5, 20(%[dst])                   \n"
+      "sw        $t6, 24(%[dst])                   \n"
+      "sw        $t7, 28(%[dst])                   \n"
+      "addiu     %[dst], %[dst], 32                \n"
+
+      "chk1w:                                      \n"
+      "andi      %[count], $t8, 0x3                \n"
+      // now count is the remainder past 1w chunks
+      "beq       %[count], $t8, $last8             \n"
+      " subu     $a3, $t8, %[count]                \n"
+      // a3 is count of bytes in 1w chunks
+      "addu      $a3, %[dst], $a3                  \n"
+      // now a3 is the dst address past the 1w chunks
+      // copying in words (4-byte chunks)
+      "$wordCopy_loop:                             \n"
+      "lw        $t3, 0(%[src])                    \n"
+      // the first t3 may be equal to t0 ... optimize?
+      "addiu     %[src], %[src],4                  \n"
+      "addiu     %[dst], %[dst],4                  \n"
+      "bne       %[dst], $a3,$wordCopy_loop        \n"
+      " sw       $t3, -4(%[dst])                   \n"
+
+      // For the last (<8) bytes
+      "$last8:                                     \n"
+      "blez      %[count], leave                   \n"
+      " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
+      "$last8loop:                                 \n"
+      "lb        $v1, 0(%[src])                    \n"
+      "addiu     %[src], %[src], 1                 \n"
+      "addiu     %[dst], %[dst], 1                 \n"
+      "bne       %[dst], $a3, $last8loop           \n"
+      " sb       $v1, -1(%[dst])                   \n"
+
+      "leave:                                      \n"
+      "  j       $ra                               \n"
+      "  nop                                       \n"
+
+      //
+      // UNALIGNED case
+      //
+
+      "unaligned:                                  \n"
+      // got here with a3="negu a1"
+      "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
+      "beqz      $a3, $ua_chk16w                   \n"
+      " subu     %[count], %[count], $a3           \n"
+      // bytes left after initial a3 bytes
+      "lwr       $v1, 0(%[src])                    \n"
+      "lwl       $v1, 3(%[src])                    \n"
+      "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
+      "swr       $v1, 0(%[dst])                    \n"
+      "addu      %[dst], %[dst], $a3               \n"
+      // below the dst will be word aligned (NOTE1)
+      "$ua_chk16w:                                 \n"
+      "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+      // t8 is the byte count after 64-byte chunks
+      "beq       %[count], $t8, ua_chk8w           \n"
+      // if a2==t8, no 64-byte chunks
+      // There will be at most 1 32-byte chunk after it
+      "subu      $a3, %[count], $t8                \n"  // the reminder
+      // Here a3 counts bytes in 16w chunks
+      "addu      $a3, %[dst], $a3                  \n"
+      // Now a3 is the final dst after 64-byte chunks
+      "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
+      "subu      $t9, $t0, 160                     \n"
+      // t9 is the "last safe pref 30,128(a1)" address
+      "pref      0, 0(%[src])                      \n"  // first line of src
+      "pref      0, 32(%[src])                     \n"  // second line  addr 32
+      "pref      0, 64(%[src])                     \n"
+      "pref      30, 32(%[dst])                    \n"
+      // safe, as we have at least 64 bytes ahead
+      // In case the a1 > t9 don't use "pref 30" at all
+      "sgtu      $v1, %[dst], $t9                  \n"
+      "bgtz      $v1, $ua_loop16w                  \n"
+      // skip "pref 30,64(a1)" for too short arrays
+      " nop                                        \n"
+      // otherwise, start with using pref30
+      "pref      30, 64(%[dst])                    \n"
+      "$ua_loop16w:                                \n"
+      "pref      0, 96(%[src])                     \n"
+      "lwr       $t0, 0(%[src])                    \n"
+      "lwl       $t0, 3(%[src])                    \n"
+      "lwr       $t1, 4(%[src])                    \n"
+      "bgtz      $v1, $ua_skip_pref30_96           \n"
+      " lwl      $t1, 7(%[src])                    \n"
+      "pref      30, 96(%[dst])                    \n"
+      // continue setting up the dest, addr 96
+      "$ua_skip_pref30_96:                         \n"
+      "lwr       $t2, 8(%[src])                    \n"
+      "lwl       $t2, 11(%[src])                   \n"
+      "lwr       $t3, 12(%[src])                   \n"
+      "lwl       $t3, 15(%[src])                   \n"
+      "lwr       $t4, 16(%[src])                   \n"
+      "lwl       $t4, 19(%[src])                   \n"
+      "lwr       $t5, 20(%[src])                   \n"
+      "lwl       $t5, 23(%[src])                   \n"
+      "lwr       $t6, 24(%[src])                   \n"
+      "lwl       $t6, 27(%[src])                   \n"
+      "lwr       $t7, 28(%[src])                   \n"
+      "lwl       $t7, 31(%[src])                   \n"
+      "pref      0, 128(%[src])                    \n"
+      // bring the next lines of src, addr 128
+      "sw        $t0, 0(%[dst])                    \n"
+      "sw        $t1, 4(%[dst])                    \n"
+      "sw        $t2, 8(%[dst])                    \n"
+      "sw        $t3, 12(%[dst])                   \n"
+      "sw        $t4, 16(%[dst])                   \n"
+      "sw        $t5, 20(%[dst])                   \n"
+      "sw        $t6, 24(%[dst])                   \n"
+      "sw        $t7, 28(%[dst])                   \n"
+      "lwr       $t0, 32(%[src])                   \n"
+      "lwl       $t0, 35(%[src])                   \n"
+      "lwr       $t1, 36(%[src])                   \n"
+      "bgtz      $v1, ua_skip_pref30_128           \n"
+      " lwl      $t1, 39(%[src])                   \n"
+      "pref      30, 128(%[dst])                   \n"
+      // continue setting up the dest, addr 128
+      "ua_skip_pref30_128:                         \n"
+
+      "lwr       $t2, 40(%[src])                   \n"
+      "lwl       $t2, 43(%[src])                   \n"
+      "lwr       $t3, 44(%[src])                   \n"
+      "lwl       $t3, 47(%[src])                   \n"
+      "lwr       $t4, 48(%[src])                   \n"
+      "lwl       $t4, 51(%[src])                   \n"
+      "lwr       $t5, 52(%[src])                   \n"
+      "lwl       $t5, 55(%[src])                   \n"
+      "lwr       $t6, 56(%[src])                   \n"
+      "lwl       $t6, 59(%[src])                   \n"
+      "lwr       $t7, 60(%[src])                   \n"
+      "lwl       $t7, 63(%[src])                   \n"
+      "pref      0, 160(%[src])                    \n"
+      // bring the next lines of src, addr 160
+      "sw        $t0, 32(%[dst])                   \n"
+      "sw        $t1, 36(%[dst])                   \n"
+      "sw        $t2, 40(%[dst])                   \n"
+      "sw        $t3, 44(%[dst])                   \n"
+      "sw        $t4, 48(%[dst])                   \n"
+      "sw        $t5, 52(%[dst])                   \n"
+      "sw        $t6, 56(%[dst])                   \n"
+      "sw        $t7, 60(%[dst])                   \n"
+
+      "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
+      "sgtu      $v1,%[dst],$t9                    \n"
+      "bne       %[dst],$a3,$ua_loop16w            \n"
+      " addiu    %[src],%[src],64                  \n"  // adding 64 to src
+      "move      %[count],$t8                      \n"
+
+      // Here we have src and dest word-aligned but less than 64-bytes to go
+
+      "ua_chk8w:                                   \n"
+      "pref      0, 0x0(%[src])                    \n"
+      "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+      // t8 is the remainder count
+      "beq       %[count], $t8, $ua_chk1w          \n"
+      // when count==t8, no 32-byte chunk
+
+      "lwr       $t0, 0(%[src])                    \n"
+      "lwl       $t0, 3(%[src])                    \n"
+      "lwr       $t1, 4(%[src])                    \n"
+      "lwl       $t1, 7(%[src])                    \n"
+      "lwr       $t2, 8(%[src])                    \n"
+      "lwl       $t2, 11(%[src])                   \n"
+      "lwr       $t3, 12(%[src])                   \n"
+      "lwl       $t3, 15(%[src])                   \n"
+      "lwr       $t4, 16(%[src])                   \n"
+      "lwl       $t4, 19(%[src])                   \n"
+      "lwr       $t5, 20(%[src])                   \n"
+      "lwl       $t5, 23(%[src])                   \n"
+      "lwr       $t6, 24(%[src])                   \n"
+      "lwl       $t6, 27(%[src])                   \n"
+      "lwr       $t7, 28(%[src])                   \n"
+      "lwl       $t7, 31(%[src])                   \n"
+      "addiu     %[src], %[src], 32                \n"
+
+      "sw        $t0, 0(%[dst])                    \n"
+      "sw        $t1, 4(%[dst])                    \n"
+      "sw        $t2, 8(%[dst])                    \n"
+      "sw        $t3, 12(%[dst])                   \n"
+      "sw        $t4, 16(%[dst])                   \n"
+      "sw        $t5, 20(%[dst])                   \n"
+      "sw        $t6, 24(%[dst])                   \n"
+      "sw        $t7, 28(%[dst])                   \n"
+      "addiu     %[dst], %[dst], 32                \n"
+
+      "$ua_chk1w:                                  \n"
+      "andi      %[count], $t8, 0x3                \n"
+      // now count is the remainder past 1w chunks
+      "beq       %[count], $t8, ua_smallCopy       \n"
+      "subu      $a3, $t8, %[count]                \n"
+      // a3 is count of bytes in 1w chunks
+      "addu      $a3, %[dst], $a3                  \n"
+      // now a3 is the dst address past the 1w chunks
+
+      // copying in words (4-byte chunks)
+      "$ua_wordCopy_loop:                          \n"
+      "lwr       $v1, 0(%[src])                    \n"
+      "lwl       $v1, 3(%[src])                    \n"
+      "addiu     %[src], %[src], 4                 \n"
+      "addiu     %[dst], %[dst], 4                 \n"
+      // note: dst is word aligned here, see NOTE1
+      "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
+      " sw       $v1,-4(%[dst])                    \n"
+
+      // Now less than 4 bytes (value in count) left to copy
+      "ua_smallCopy:                               \n"
+      "beqz      %[count], leave                   \n"
+      " addu     $a3, %[dst], %[count]             \n"  // a3 = last dst address
+      "$ua_smallCopy_loop:                         \n"
+      "lb        $v1, 0(%[src])                    \n"
+      "addiu     %[src], %[src], 1                 \n"
+      "addiu     %[dst], %[dst], 1                 \n"
+      "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
+      " sb       $v1, -1(%[dst])                   \n"
+
+      "j         $ra                               \n"
+      " nop                                        \n"
+      ".set      at                                \n"
+      ".set      reorder                           \n"
+      : [dst] "+r"(dst), [src] "+r"(src)
+      : [count] "r"(count)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1",
+        "at");
+}
+#endif  // HAS_COPYROW_MIPS
+
+// DSPR2 functions
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) &&   \
+    (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \
+    (__mips_isa_rev < 6)
+
+void SplitUVRow_DSPR2(const uint8* src_uv,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
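+  // Splits 16 UV pairs per loop iteration: precrq.qb.ph keeps the high byte
+  // of each halfword (V) and precr.qb.ph keeps the low byte (U); the tail
+  // loop below handles any remaining pairs one byte at a time.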
+  __asm__ __volatile__(
+      ".set push                                     \n"
+      ".set noreorder                                \n"
+      "srl             $t4, %[width], 4              \n"  // multiplies of 16
+      "blez            $t4, 2f                       \n"
+      " andi           %[width], %[width], 0xf       \n"  // residual
+
+      "1:                                            \n"
+      "addiu           $t4, $t4, -1                  \n"
+      "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
+      "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
+      "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
+      "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
+      "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
+      "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 |
+                                                          // U10
+      "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 |
+                                                          // U12
+      "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 |
+                                                          // U14
+      "addiu           %[src_uv], %[src_uv], 32      \n"
+      "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
+      "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
+      "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
+      "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
+      "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
+      "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
+      "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 |
+                                                          // V12
+      "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 |
+                                                          // U12
+      "sw              $t9, 0(%[dst_v])              \n"
+      "sw              $t0, 0(%[dst_u])              \n"
+      "sw              $t1, 4(%[dst_v])              \n"
+      "sw              $t2, 4(%[dst_u])              \n"
+      "sw              $t3, 8(%[dst_v])              \n"
+      "sw              $t5, 8(%[dst_u])              \n"
+      "sw              $t6, 12(%[dst_v])             \n"
+      "sw              $t7, 12(%[dst_u])             \n"
+      "addiu           %[dst_v], %[dst_v], 16        \n"
+      "bgtz            $t4, 1b                       \n"
+      " addiu          %[dst_u], %[dst_u], 16        \n"
+
+      "beqz            %[width], 3f                  \n"
+      " nop                                          \n"
+
+      "2:                                              \n"
+      "lbu             $t0, 0(%[src_uv])             \n"
+      "lbu             $t1, 1(%[src_uv])             \n"
+      "addiu           %[src_uv], %[src_uv], 2       \n"
+      "addiu           %[width], %[width], -1        \n"
+      "sb              $t0, 0(%[dst_u])              \n"
+      "sb              $t1, 0(%[dst_v])              \n"
+      "addiu           %[dst_u], %[dst_u], 1         \n"
+      "bgtz            %[width], 2b                  \n"
+      " addiu          %[dst_v], %[dst_v], 1         \n"
+
+      "3:                                              \n"
+      ".set pop                                      \n"
+      : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u),
+        [dst_v] "+r"(dst_v)
+      :
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
+
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
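+  // Reverses 16 bytes per loop iteration: wsbh swaps the bytes within each
+  // halfword and rotr by 16 swaps the halfwords, fully reversing each word.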
+  __asm__ __volatile__(
+      ".set push                             \n"
+      ".set noreorder                        \n"
+
+      "srl       $t4, %[width], 4            \n"  // multiplies of 16
+      "andi      $t5, %[width], 0xf          \n"
+      "blez      $t4, 2f                     \n"
+      " addu     %[src], %[src], %[width]    \n"  // src += width
+
+      "1:                                     \n"
+      "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
+      "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
+      "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
+      "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
+      "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
+      "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
+      "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
+      "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
+      "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
+      "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
+      "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
+      "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
+      "addiu     %[src], %[src], -16         \n"
+      "addiu     $t4, $t4, -1                \n"
+      "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
+      "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
+      "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
+      "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
+      "bgtz      $t4, 1b                     \n"
+      " addiu    %[dst], %[dst], 16          \n"
+      "beqz      $t5, 3f                     \n"
+      " nop                                  \n"
+
+      "2:                                     \n"
+      "lbu       $t0, -1(%[src])             \n"
+      "addiu     $t5, $t5, -1                \n"
+      "addiu     %[src], %[src], -1          \n"
+      "sb        $t0, 0(%[dst])              \n"
+      "bgez      $t5, 2b                     \n"
+      " addiu    %[dst], %[dst], 1           \n"
+
+      "3:                                     \n"
+      ".set pop                              \n"
+      : [src] "+r"(src), [dst] "+r"(dst)
+      : [width] "r"(width)
+      : "t0", "t1", "t2", "t3", "t4", "t5");
+}
+
+void MirrorUVRow_DSPR2(const uint8* src_uv,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
+  int x;
+  int y;
+  __asm__ __volatile__(
+      ".set push                                    \n"
+      ".set noreorder                               \n"
+
+      "addu            $t4, %[width], %[width]      \n"
+      "srl             %[x], %[width], 4            \n"
+      "andi            %[y], %[width], 0xf          \n"
+      "blez            %[x], 2f                     \n"
+      " addu           %[src_uv], %[src_uv], $t4    \n"
+
+      "1:                                          \n"
+      "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
+      "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
+      "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
+      "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
+      "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
+      "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
+      "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
+      "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
+
+      "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
+      "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
+      "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
+      "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
+      "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
+      "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
+      "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
+      "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
+      "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
+      "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
+      "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
+      "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
+      "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
+      "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
+      "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
+      "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
+      "addiu           %[src_uv], %[src_uv], -32    \n"
+      "addiu           %[x], %[x], -1               \n"
+      "swr             $t4, 0(%[dst_u])             \n"
+      "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
+      "swr             $t6, 0(%[dst_v])             \n"
+      "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
+      "swr             $t2, 4(%[dst_u])             \n"
+      "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
+      "swr             $t3, 4(%[dst_v])             \n"
+      "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
+      "swr             $t0, 8(%[dst_u])             \n"
+      "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
+      "swr             $t1, 8(%[dst_v])             \n"
+      "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
+      "swr             $t9, 12(%[dst_u])            \n"
+      "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
+      "swr             $t5, 12(%[dst_v])            \n"
+      "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
+      "addiu           %[dst_v], %[dst_v], 16       \n"
+      "bgtz            %[x], 1b                     \n"
+      " addiu          %[dst_u], %[dst_u], 16       \n"
+      "beqz            %[y], 3f                     \n"
+      " nop                                         \n"
+      "b               2f                           \n"
+      " nop                                         \n"
+
+      "2:                                            \n"
+      "lbu             $t0, -2(%[src_uv])           \n"
+      "lbu             $t1, -1(%[src_uv])           \n"
+      "addiu           %[src_uv], %[src_uv], -2     \n"
+      "addiu           %[y], %[y], -1               \n"
+      "sb              $t0, 0(%[dst_u])             \n"
+      "sb              $t1, 0(%[dst_v])             \n"
+      "addiu           %[dst_u], %[dst_u], 1        \n"
+      "bgtz            %[y], 2b                     \n"
+      " addiu          %[dst_v], %[dst_v], 1        \n"
+
+      "3:                                            \n"
+      ".set pop                                     \n"
+      : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v),
+        [x] "=&r"(x), [y] "=&r"(y)
+      : [width] "r"(width)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
+}
+
+void I422ToARGBRow_DSPR2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  uint32 tmp_ub = yuvconstants->kUVToB[0];
+  uint32 tmp_ug = yuvconstants->kUVToG[0];
+  uint32 tmp_vg = yuvconstants->kUVToG[1];
+  uint32 tmp_vr = yuvconstants->kUVToR[1];
+  uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+  uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+  uint32 tmp_br = yuvconstants->kUVBiasR[0];
+  uint32 yg = yuvconstants->kYToRgb[0];
+  uint32 tmp_yg;
+  uint32 tmp_mask = 0x7fff7fff;
+  tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+  tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+  tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+  tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+  tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+  tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+  tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+  tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+  yg = yg * 0x0101;
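+  // Each 16-bit coefficient is packed into both halfwords so the paired
+  // halfword (.ph) instructions below apply it to two pixels at once; UB and
+  // VR are negated (~x + 0x00010001) so their terms can simply be added.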
+
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+    __asm__ __volatile__(
+        ".set push                                             \n"
+        ".set noreorder                                        \n"
+        "lbu              %[tmp_t7], 0(%[src_y])               \n"
+        "lbu              %[tmp_t1], 1(%[src_y])               \n"
+        "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
+        "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
+        "lbu              %[tmp_t2], 0(%[src_u])               \n"
+        "lbu              %[tmp_t3], 0(%[src_v])               \n"
+        "replv.ph         %[tmp_t2], %[tmp_t2]                 \n"
+        "replv.ph         %[tmp_t3], %[tmp_t3]                 \n"
+        "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
+        "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
+        "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
+        "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
+        "srl              %[tmp_t7], %[tmp_t7],     16         \n"
+        "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
+        "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
+        "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
+        "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
+        "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
+        "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
+        "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
+        "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
+        "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
+        "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
+        "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
+        "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
+        "precrq.ph.w      %[tmp_t9], %[tmp_t8],     %[tmp_t7]  \n"
+        "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
+        "precr.qb.ph      %[tmp_t8], %[tmp_t9],     %[tmp_t7]  \n"
+        "precrq.qb.ph     %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
+        "sw               %[tmp_t8], 0(%[rgb_buf])             \n"
+        "sw               %[tmp_t7], 4(%[rgb_buf])             \n"
+        ".set pop                                              \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+        : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
+          [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg),
+          [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
+          [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
+          [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+}
+
+// Bilinear filter 8x2 -> 8x1
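+// Effectively computes, for every byte:
+//   dst = (src_ptr * y0_fraction + src_ptr1 * source_y_fraction + 128) >> 8
+// using paired-halfword multiplies and the rounding shift shra_r.ph.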
+void InterpolateRow_DSPR2(uint8* dst_ptr,
+                          const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction) {
+  int y0_fraction = 256 - source_y_fraction;
+  const uint8* src_ptr1 = src_ptr + src_stride;
+
+  __asm__ __volatile__(
+      ".set push                                           \n"
+      ".set noreorder                                      \n"
+
+      "replv.ph          $t0, %[y0_fraction]               \n"
+      "replv.ph          $t1, %[source_y_fraction]         \n"
+
+      "1:                                                    \n"
+      "lw                $t2, 0(%[src_ptr])                \n"
+      "lw                $t3, 0(%[src_ptr1])               \n"
+      "lw                $t4, 4(%[src_ptr])                \n"
+      "lw                $t5, 4(%[src_ptr1])               \n"
+      "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
+      "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
+      "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
+      "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
+      "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
+      "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
+      "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
+      "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
+      "addq.ph           $t6, $t6, $t8                     \n"
+      "addq.ph           $t7, $t7, $t9                     \n"
+      "addq.ph           $t2, $t2, $t4                     \n"
+      "addq.ph           $t3, $t3, $t5                     \n"
+      "shra_r.ph         $t6, $t6, 8                       \n"
+      "shra_r.ph         $t7, $t7, 8                       \n"
+      "shra_r.ph         $t2, $t2, 8                       \n"
+      "shra_r.ph         $t3, $t3, 8                       \n"
+      "precr.qb.ph       $t6, $t6, $t7                     \n"
+      "precr.qb.ph       $t2, $t2, $t3                     \n"
+      "addiu             %[src_ptr], %[src_ptr], 8         \n"
+      "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
+      "addiu             %[dst_width], %[dst_width], -8    \n"
+      "sw                $t6, 0(%[dst_ptr])                \n"
+      "sw                $t2, 4(%[dst_ptr])                \n"
+      "bgtz              %[dst_width], 1b                  \n"
+      " addiu            %[dst_ptr], %[dst_ptr], 8         \n"
+
+      ".set pop                                            \n"
+      : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1),
+        [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width)
+      : [source_y_fraction] "r"(source_y_fraction),
+        [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
+}
+void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  int x;
+  uint32 tmp_mask = 0xff;
+  uint32 tmp_t1;
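+  // Each iteration loads 4 bytes with ulw (one RGB24 pixel plus the first
+  // byte of the next), forces alpha = 0xff into bits 31:24 with ins and
+  // stores one ARGB word; the last pixel is converted after the loop so no
+  // read goes past the end of the row.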
+  for (x = 0; x < (width - 1); ++x) {
+    __asm__ __volatile__(
+        ".set push                                                  \n"
+        ".set noreorder                                             \n"
+        "ulw             %[tmp_t1],    0(%[src_rgb24])              \n"
+        "addiu           %[dst_argb],  %[dst_argb],     4           \n"
+        "addiu           %[src_rgb24], %[src_rgb24],    3           \n"
+        "ins             %[tmp_t1],    %[tmp_mask],     24,    8    \n"
+        "sw              %[tmp_t1],    -4(%[dst_argb])              \n"
+        ".set pop                                                   \n"
+        : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb),
+          [tmp_t1] "=&r"(tmp_t1)
+        : [tmp_mask] "r"(tmp_mask)
+        : "memory");
+  }
+  uint8 b = src_rgb24[0];
+  uint8 g = src_rgb24[1];
+  uint8 r = src_rgb24[2];
+  dst_argb[0] = b;
+  dst_argb[1] = g;
+  dst_argb[2] = r;
+  dst_argb[3] = 255u;
+}
+
+void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
+  int x;
+  uint32 tmp_mask = 0xff;
+  uint32 tmp_t1, tmp_t2;
+  for (x = 0; x < (width - 1); ++x) {
+    __asm__ __volatile__(
+        ".set push                                               \n"
+        ".set noreorder                                          \n"
+        "ulw               %[tmp_t1],   0(%[src_raw])            \n"
+        "addiu             %[dst_argb], %[dst_argb],      4      \n"
+        "addiu             %[src_raw],  %[src_raw],       3      \n"
+        "srl               %[tmp_t2],   %[tmp_t1],        16     \n"
+        "ins               %[tmp_t1],   %[tmp_mask],      24, 8  \n"
+        "ins               %[tmp_t1],   %[tmp_t1],        16, 8  \n"
+        "ins               %[tmp_t1],   %[tmp_t2],        0,  8  \n"
+        "sw                %[tmp_t1],   -4(%[dst_argb])          \n"
+        ".set pop                                                \n"
+        : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb),
+          [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2)
+        : [tmp_mask] "r"(tmp_mask)
+        : "memory");
+  }
+  uint8 r = src_raw[0];
+  uint8 g = src_raw[1];
+  uint8 b = src_raw[2];
+  dst_argb[0] = b;
+  dst_argb[1] = g;
+  dst_argb[2] = r;
+  dst_argb[3] = 255u;
+}
+
+void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565,
+                           uint8* dst_argb,
+                           int width) {
+  int x;
+  uint32 tmp_mask = 0xff;
+  uint32 tmp_t1, tmp_t2, tmp_t3;
+  for (x = 0; x < width; ++x) {
+    __asm__ __volatile__(
+        ".set push                                                   \n"
+        ".set noreorder                                              \n"
+        "lhu               %[tmp_t1],     0(%[src_rgb565])           \n"
+        "addiu             %[dst_argb],   %[dst_argb],      4        \n"
+        "addiu             %[src_rgb565], %[src_rgb565],    2        \n"
+        "sll               %[tmp_t2],     %[tmp_t1],        8        \n"
+        "ins               %[tmp_t2],     %[tmp_mask],      24,8     \n"
+        "ins               %[tmp_t2],     %[tmp_t1],        3, 16    \n"
+        "ins               %[tmp_t2],     %[tmp_t1],        5, 11    \n"
+        "srl               %[tmp_t3],     %[tmp_t1],        9        \n"
+        "ins               %[tmp_t2],     %[tmp_t3],        8, 2     \n"
+        "ins               %[tmp_t2],     %[tmp_t1],        3, 5     \n"
+        "srl               %[tmp_t3],     %[tmp_t1],        2        \n"
+        "ins               %[tmp_t2],     %[tmp_t3],        0, 3     \n"
+        "sw                %[tmp_t2],     -4(%[dst_argb])            \n"
+        ".set pop                                                    \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565),
+          [dst_argb] "+r"(dst_argb)
+        : [tmp_mask] "r"(tmp_mask));
+  }
+}
+
+void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
+                             uint8* dst_argb,
+                             int width) {
+  int x;
+  uint32 tmp_t1, tmp_t2, tmp_t3;
+  for (x = 0; x < width; ++x) {
+    __asm__ __volatile__(
+        ".set push                                                   \n"
+        ".set noreorder                                              \n"
+        "lh                %[tmp_t1],       0(%[src_argb1555])       \n"
+        "addiu             %[dst_argb],     %[dst_argb],      4      \n"
+        "addiu             %[src_argb1555], %[src_argb1555],  2      \n"
+        "sll               %[tmp_t2],       %[tmp_t1],        9      \n"
+        "ins               %[tmp_t2],       %[tmp_t1],        4, 15  \n"
+        "ins               %[tmp_t2],       %[tmp_t1],        6, 10  \n"
+        "srl               %[tmp_t3],       %[tmp_t1],        7      \n"
+        "ins               %[tmp_t2],       %[tmp_t3],        8, 3   \n"
+        "ins               %[tmp_t2],       %[tmp_t1],        3, 5   \n"
+        "srl               %[tmp_t3],       %[tmp_t1],        2      \n"
+        "ins               %[tmp_t2],       %[tmp_t3],        0, 3   \n"
+        "sw                %[tmp_t2],       -4(%[dst_argb])          \n"
+        ".set pop                                                    \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555),
+          [dst_argb] "+r"(dst_argb)
+        :);
+  }
+}
+
+void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
+                             uint8* dst_argb,
+                             int width) {
+  int x;
+  uint32 tmp_t1;
+  for (x = 0; x < width; ++x) {
+    __asm__ __volatile__(
+        ".set push                                                    \n"
+        ".set noreorder                                               \n"
+        "lh                %[tmp_t1],       0(%[src_argb4444])        \n"
+        "addiu             %[dst_argb],     %[dst_argb],       4      \n"
+        "addiu             %[src_argb4444], %[src_argb4444],   2      \n"
+        "ins               %[tmp_t1],       %[tmp_t1],         16, 16 \n"
+        "ins               %[tmp_t1],       %[tmp_t1],         12, 16 \n"
+        "ins               %[tmp_t1],       %[tmp_t1],         8,  12 \n"
+        "ins               %[tmp_t1],       %[tmp_t1],         4,  8  \n"
+        "sw                %[tmp_t1],       -4(%[dst_argb])           \n"
+        ".set pop                                                     \n"
+        : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb),
+          [tmp_t1] "=&r"(tmp_t1));
+  }
+}
+
+void I444ToARGBRow_DSPR2(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  uint32 tmp_ub = yuvconstants->kUVToB[0];
+  uint32 tmp_ug = yuvconstants->kUVToG[0];
+  uint32 tmp_vg = yuvconstants->kUVToG[1];
+  uint32 tmp_vr = yuvconstants->kUVToR[1];
+  uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+  uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+  uint32 tmp_br = yuvconstants->kUVBiasR[0];
+  uint32 yg = yuvconstants->kYToRgb[0];
+  uint32 tmp_mask = 0x7fff7fff;
+  uint32 tmp_yg;
+
+  tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+  tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+  tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+  tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+  tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+  tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+  tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+  tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+  yg = yg * 0x0101;
+
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+    __asm__ __volatile__(
+        ".set push                                              \n"
+        ".set noreorder                                         \n"
+        "lbu              %[tmp_t7], 0(%[y_buf])               \n"
+        "lbu              %[tmp_t1], 1(%[y_buf])               \n"
+        "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
+        "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
+        "lh               %[tmp_t2], 0(%[u_buf])               \n"
+        "lh               %[tmp_t3], 0(%[v_buf])               \n"
+        "preceu.ph.qbr    %[tmp_t2], %[tmp_t2]                 \n"
+        "preceu.ph.qbr    %[tmp_t3], %[tmp_t3]                 \n"
+        "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
+        "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
+        "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
+        "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
+        "srl              %[tmp_t7], %[tmp_t7],     16         \n"
+        "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
+        "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
+        "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
+        "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
+        "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
+        "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
+        "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
+        "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
+        "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
+        "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
+        "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
+        "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
+        "precrq.ph.w      %[tmp_t2], %[tmp_t8],     %[tmp_t7]  \n"
+        "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
+        "precr.qb.ph      %[tmp_t8], %[tmp_t2],     %[tmp_t7]  \n"
+        "precrq.qb.ph     %[tmp_t7], %[tmp_t2],     %[tmp_t7]  \n"
+        "sw               %[tmp_t8], 0(%[rgb_buf])             \n"
+        "sw               %[tmp_t7], 4(%[rgb_buf])             \n"
+        ".set pop                                              \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+        : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf),
+          [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug),
+          [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
+          [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
+          [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
+    y_buf += 2;
+    u_buf += 2;
+    v_buf += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+}
+
+void I422ToARGB4444Row_DSPR2(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb4444,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  int x;
+  uint32 tmp_ub = yuvconstants->kUVToB[0];
+  uint32 tmp_ug = yuvconstants->kUVToG[0];
+  uint32 tmp_vg = yuvconstants->kUVToG[1];
+  uint32 tmp_vr = yuvconstants->kUVToR[1];
+  uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+  uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+  uint32 tmp_br = yuvconstants->kUVBiasR[0];
+  uint32 yg = yuvconstants->kYToRgb[0];
+  uint32 tmp_yg;
+  uint32 tmp_mask = 0x7fff7fff;
+  tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+  tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+  tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+  tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+  tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+  tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+  tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+  tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+  yg = yg * 0x0101;
+
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+    __asm__ __volatile__(
+        ".set push                                             \n"
+        ".set noreorder                                        \n"
+        "lbu              %[tmp_t7], 0(%[src_y])               \n"
+        "lbu              %[tmp_t1], 1(%[src_y])               \n"
+        "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
+        "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
+        "lbu              %[tmp_t2], 0(%[src_u])               \n"
+        "lbu              %[tmp_t3], 0(%[src_v])               \n"
+        "replv.ph         %[tmp_t2], %[tmp_t2]                 \n"
+        "replv.ph         %[tmp_t3], %[tmp_t3]                 \n"
+        "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
+        "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
+        "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
+        "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
+        "srl              %[tmp_t7], %[tmp_t7],     16         \n"
+        "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
+        "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
+        "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
+        "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
+        "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
+        "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
+        "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
+        "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
+        "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
+        "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
+        "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
+        "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
+        "precrq.ph.w      %[tmp_t2], %[tmp_t8],     %[tmp_t7]  \n"
+        "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
+        "precr.qb.ph      %[tmp_t8], %[tmp_t2],     %[tmp_t7]  \n"
+        "precrq.qb.ph     %[tmp_t7], %[tmp_t2],     %[tmp_t7]  \n"
+        "shrl.qb          %[tmp_t1], %[tmp_t8],     4          \n"
+        "shrl.qb          %[tmp_t2], %[tmp_t7],     4          \n"
+        "shrl.ph          %[tmp_t8], %[tmp_t1],     4          \n"
+        "shrl.ph          %[tmp_t7], %[tmp_t2],     4          \n"
+        "or               %[tmp_t8], %[tmp_t8],     %[tmp_t1]  \n"
+        "or               %[tmp_t7], %[tmp_t7],     %[tmp_t2]  \n"
+        "precr.qb.ph      %[tmp_t8], %[tmp_t7],     %[tmp_t8]  \n"
+        "sw               %[tmp_t8], 0(%[dst_argb4444])        \n"
+        ".set pop                                              \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+        : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u),
+          [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
+          [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
+          [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
+          [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb4444 += 4;  // Advance 2 pixels.
+  }
+}
+
+void I422ToARGB1555Row_DSPR2(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb1555,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  int x;
+  uint32 tmp_ub = yuvconstants->kUVToB[0];
+  uint32 tmp_ug = yuvconstants->kUVToG[0];
+  uint32 tmp_vg = yuvconstants->kUVToG[1];
+  uint32 tmp_vr = yuvconstants->kUVToR[1];
+  uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+  uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+  uint32 tmp_br = yuvconstants->kUVBiasR[0];
+  uint32 yg = yuvconstants->kYToRgb[0];
+  uint32 tmp_yg;
+  uint32 tmp_mask = 0x80008000;
+  tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+  tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+  tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+  tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+  tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+  tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+  tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+  tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+  yg = yg * 0x0101;
+
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+    __asm__ __volatile__(
+        ".set push                                             \n"
+        ".set noreorder                                        \n"
+        "lbu              %[tmp_t7], 0(%[src_y])               \n"
+        "lbu              %[tmp_t1], 1(%[src_y])               \n"
+        "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
+        "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
+        "lbu              %[tmp_t2], 0(%[src_u])               \n"
+        "lbu              %[tmp_t3], 0(%[src_v])               \n"
+        "replv.ph         %[tmp_t2], %[tmp_t2]                 \n"
+        "replv.ph         %[tmp_t3], %[tmp_t3]                 \n"
+        "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
+        "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
+        "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
+        "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
+        "srl              %[tmp_t7], %[tmp_t7],     16         \n"
+        "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
+        "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
+        "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
+        "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
+        "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
+        "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
+        "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
+        "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
+        "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
+        "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
+        "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
+        "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
+        "precrq.ph.w      %[tmp_t2], %[tmp_t8],     %[tmp_t7]  \n"
+        "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
+        "precr.qb.ph      %[tmp_t8], %[tmp_t2],     %[tmp_t7]  \n"
+        "precrq.qb.ph     %[tmp_t7], %[tmp_t2],     %[tmp_t7]  \n"
+        "ins              %[tmp_t3], %[tmp_t8],     7,      24 \n"
+        "ins              %[tmp_t3], %[tmp_t8],     10,     16 \n"
+        "ins              %[tmp_t3], %[tmp_t8],     13,     8  \n"
+        "ins              %[tmp_t4], %[tmp_t7],     7,      24 \n"
+        "ins              %[tmp_t4], %[tmp_t7],     10,     16 \n"
+        "ins              %[tmp_t4], %[tmp_t7],     13,     8  \n"
+        "precrq.ph.w      %[tmp_t8], %[tmp_t4],     %[tmp_t3]  \n"
+        "or               %[tmp_t8], %[tmp_t8],     %[tmp_mask]\n"
+        "sw               %[tmp_t8], 0(%[dst_argb1555])        \n"
+        ".set pop                                              \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+        : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u),
+          [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
+          [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
+          [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
+          [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb1555 += 4;  // Advance 2 pixels.
+  }
+}
+
+void NV12ToARGBRow_DSPR2(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  uint32 tmp_ub = yuvconstants->kUVToB[0];
+  uint32 tmp_ug = yuvconstants->kUVToG[0];
+  uint32 tmp_vg = yuvconstants->kUVToG[1];
+  uint32 tmp_vr = yuvconstants->kUVToR[1];
+  uint32 tmp_bb = yuvconstants->kUVBiasB[0];
+  uint32 tmp_bg = yuvconstants->kUVBiasG[0];
+  uint32 tmp_br = yuvconstants->kUVBiasR[0];
+  uint32 yg = yuvconstants->kYToRgb[0];
+  uint32 tmp_mask = 0x7fff7fff;
+  uint32 tmp_yg;
+  tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
+  tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
+  tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
+  tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
+  tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
+  tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
+  tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
+  tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
+  yg = yg * 0x0101;
+
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
+    __asm__ __volatile__(
+        ".set push                                             \n"
+        ".set noreorder                                        \n"
+        "lbu              %[tmp_t7], 0(%[src_y])               \n"
+        "lbu              %[tmp_t1], 1(%[src_y])               \n"
+        "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
+        "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
+        "lbu              %[tmp_t2], 0(%[src_uv])              \n"
+        "lbu              %[tmp_t3], 1(%[src_uv])              \n"
+        "replv.ph         %[tmp_t2], %[tmp_t2]                 \n"
+        "replv.ph         %[tmp_t3], %[tmp_t3]                 \n"
+        "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
+        "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
+        "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
+        "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
+        "srl              %[tmp_t7], %[tmp_t7],     16         \n"
+        "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
+        "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
+        "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
+        "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
+        "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
+        "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
+        "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
+        "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
+        "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
+        "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
+        "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
+        "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
+        "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
+        "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
+        "precrq.ph.w      %[tmp_t2], %[tmp_t8],     %[tmp_t7]  \n"
+        "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
+        "precr.qb.ph      %[tmp_t8], %[tmp_t2],     %[tmp_t7]  \n"
+        "precrq.qb.ph     %[tmp_t7], %[tmp_t2],     %[tmp_t7]  \n"
+        "sw               %[tmp_t8], 0(%[rgb_buf])             \n"
+        "sw               %[tmp_t7], 4(%[rgb_buf])             \n"
+        ".set pop                                              \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
+        : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] "r"(yg),
+          [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg),
+          [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg),
+          [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf),
+          [tmp_mask] "r"(tmp_mask));
+
+    src_y += 2;
+    src_uv += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+}
+
+void BGRAToUVRow_DSPR2(const uint8* src_rgb0,
+                       int src_stride_rgb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+  int x;
+  int const1 = 0xffda0000;
+  int const2 = 0x0070ffb6;
+  int const3 = 0x00700000;
+  int const4 = 0xffeeffa2;
+  int const5 = 0x100;
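+  // const1..const4 are the packed U/V coefficients; seeding each accumulator
+  // with const5 * const5 = 0x10000 (128 << 9) folds in the +128 chroma bias
+  // before extr_r.w rounds and shifts the result right by 9.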
+  for (x = 0; x < width - 1; x += 2) {
+    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    int tmp_t6, tmp_t7, tmp_t8;
+    __asm__ __volatile__(
+        ".set push                                                 \n"
+        ".set noreorder                                            \n"
+        "lw                %[tmp_t1],   0(%[src_rgb0])             \n"
+        "lw                %[tmp_t2],   4(%[src_rgb0])             \n"
+        "lw                %[tmp_t3],   0(%[src_rgb1])             \n"
+        "lw                %[tmp_t4],   4(%[src_rgb1])             \n"
+        "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                  \n"
+        "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                  \n"
+        "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                  \n"
+        "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                  \n"
+        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                  \n"
+        "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                  \n"
+        "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                  \n"
+        "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                  \n"
+        "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t6]   \n"
+        "addu.ph           %[tmp_t7],   %[tmp_t7],     %[tmp_t8]   \n"
+        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t2]   \n"
+        "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t4]   \n"
+        "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t7]   \n"
+        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t3]   \n"
+        "shrl.ph           %[tmp_t5],   %[tmp_t5],     2           \n"
+        "shrl.ph           %[tmp_t1],   %[tmp_t1],     2           \n"
+        "mult              $ac0,        %[const5],     %[const5]   \n"
+        "mult              $ac1,        %[const5],     %[const5]   \n"
+        "dpaq_s.w.ph       $ac0,        %[tmp_t5],     %[const1]   \n"
+        "dpaq_s.w.ph       $ac1,        %[tmp_t5],     %[const3]   \n"
+        "dpaq_s.w.ph       $ac0,        %[tmp_t1],     %[const2]   \n"
+        "dpaq_s.w.ph       $ac1,        %[tmp_t1],     %[const4]   \n"
+        "extr_r.w          %[tmp_t7],   $ac0,          9           \n"
+        "extr_r.w          %[tmp_t8],   $ac1,          9           \n"
+        "addiu             %[dst_u],    %[dst_u],    1             \n"
+        "addiu             %[dst_v],    %[dst_v],    1             \n"
+        "addiu             %[src_rgb0], %[src_rgb0], 8             \n"
+        "addiu             %[src_rgb1], %[src_rgb1], 8             \n"
+        "sb                %[tmp_t7],   -1(%[dst_u])               \n"
+        "sb                %[tmp_t8],   -1(%[dst_v])               \n"
+        ".set pop                                                  \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+          [const4] "r"(const4), [const5] "r"(const5)
+        : "hi", "lo", "$ac1lo", "$ac1hi");
+  }
+}
+
+void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  int const1 = 0x00420000;
+  int const2 = 0x00190081;
+  int const5 = 0x40;
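+  // const1/const2 are the packed luma coefficients; seeding the accumulators
+  // with const5 * const5 = 0x1000 (16 << 8) folds in the +16 luma offset
+  // before extr_r.w rounds and shifts the result right by 8.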
+  for (x = 0; x < width; x += 4) {
+    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    int tmp_t6, tmp_t7, tmp_t8;
+    __asm__ __volatile__(
+        ".set push                                                \n"
+        ".set noreorder                                           \n"
+        "lw                %[tmp_t1],   0(%[src_argb0])           \n"
+        "lw                %[tmp_t2],   4(%[src_argb0])           \n"
+        "lw                %[tmp_t3],   8(%[src_argb0])           \n"
+        "lw                %[tmp_t4],   12(%[src_argb0])          \n"
+        "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
+        "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
+        "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
+        "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
+        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
+        "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
+        "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
+        "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
+        "mult              $ac0,        %[const5],     %[const5]  \n"
+        "mult              $ac1,        %[const5],     %[const5]  \n"
+        "mult              $ac2,        %[const5],     %[const5]  \n"
+        "mult              $ac3,        %[const5],     %[const5]  \n"
+        "dpa.w.ph          $ac0,        %[tmp_t5],     %[const1]  \n"
+        "dpa.w.ph          $ac1,        %[tmp_t6],     %[const1]  \n"
+        "dpa.w.ph          $ac2,        %[tmp_t7],     %[const1]  \n"
+        "dpa.w.ph          $ac3,        %[tmp_t8],     %[const1]  \n"
+        "dpa.w.ph          $ac0,        %[tmp_t1],     %[const2]  \n"
+        "dpa.w.ph          $ac1,        %[tmp_t2],     %[const2]  \n"
+        "dpa.w.ph          $ac2,        %[tmp_t3],     %[const2]  \n"
+        "dpa.w.ph          $ac3,        %[tmp_t4],     %[const2]  \n"
+        "extr_r.w          %[tmp_t1],   $ac0,          8          \n"
+        "extr_r.w          %[tmp_t2],   $ac1,          8          \n"
+        "extr_r.w          %[tmp_t3],   $ac2,          8          \n"
+        "extr_r.w          %[tmp_t4],   $ac3,          8          \n"
+        "addiu             %[src_argb0],%[src_argb0],  16         \n"
+        "addiu             %[dst_y],    %[dst_y],      4          \n"
+        "sb                %[tmp_t1],   -4(%[dst_y])              \n"
+        "sb                %[tmp_t2],   -3(%[dst_y])              \n"
+        "sb                %[tmp_t3],   -2(%[dst_y])              \n"
+        "sb                %[tmp_t4],   -1(%[dst_y])              \n"
+        ".set pop                                                 \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+          "$ac3hi");
+  }
+}
+
+void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
+                       int src_stride_rgb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+  int x;
+  int const1 = 0xffb6ffda;
+  int const2 = 0x00000070;
+  int const3 = 0xffa20070;
+  int const4 = 0x0000ffee;
+  int const5 = 0x100;
+
+  for (x = 0; x < width - 1; x += 2) {
+    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    int tmp_t6, tmp_t7, tmp_t8;
+    __asm__ __volatile__(
+        ".set push                                                \n"
+        ".set noreorder                                           \n"
+        "lw                %[tmp_t1],   0(%[src_rgb0])            \n"
+        "lw                %[tmp_t2],   4(%[src_rgb0])            \n"
+        "lw                %[tmp_t3],   0(%[src_rgb1])            \n"
+        "lw                %[tmp_t4],   4(%[src_rgb1])            \n"
+        "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
+        "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
+        "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
+        "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
+        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
+        "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
+        "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
+        "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
+        "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t6]  \n"
+        "addu.ph           %[tmp_t7],   %[tmp_t7],     %[tmp_t8]  \n"
+        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t2]  \n"
+        "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t4]  \n"
+        "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t7]  \n"
+        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t3]  \n"
+        "shrl.ph           %[tmp_t5],   %[tmp_t5],     2          \n"
+        "shrl.ph           %[tmp_t1],   %[tmp_t1],     2          \n"
+        "mult              $ac0,        %[const5],     %[const5]  \n"
+        "mult              $ac1,        %[const5],     %[const5]  \n"
+        "dpaq_s.w.ph       $ac0,        %[tmp_t5],     %[const1]  \n"
+        "dpaq_s.w.ph       $ac1,        %[tmp_t5],     %[const3]  \n"
+        "dpaq_s.w.ph       $ac0,        %[tmp_t1],     %[const2]  \n"
+        "dpaq_s.w.ph       $ac1,        %[tmp_t1],     %[const4]  \n"
+        "extr_r.w          %[tmp_t7],   $ac0,          9          \n"
+        "extr_r.w          %[tmp_t8],   $ac1,          9          \n"
+        "addiu             %[dst_u],    %[dst_u],    1            \n"
+        "addiu             %[dst_v],    %[dst_v],    1            \n"
+        "addiu             %[src_rgb0], %[src_rgb0], 8            \n"
+        "addiu             %[src_rgb1], %[src_rgb1], 8            \n"
+        "sb                %[tmp_t7],   -1(%[dst_u])              \n"
+        "sb                %[tmp_t8],   -1(%[dst_v])              \n"
+        ".set pop                                                 \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+          [const4] "r"(const4), [const5] "r"(const5)
+        : "hi", "lo", "$ac1lo", "$ac1hi");
+  }
+}
+
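+// Note: the *ToYRow_DSPR2 functions compute BT.601 limited-range luma with
+// dpa.w.ph on packed halfword pairs.  Seeding each accumulator with
+// const5 * const5 (0x40 * 0x40 = 0x1000) folds in the +16 offset before
+// the 8-bit rounding extract (extr_r.w ..., 8).  Roughly the scalar
+// equivalent, as a reference sketch only:
+//   y = (66 * r + 129 * g + 25 * b + 0x1080) >> 8;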
+void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  int const1 = 0x00810019;
+  int const2 = 0x00000042;
+  int const5 = 0x40;
+  for (x = 0; x < width; x += 4) {
+    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    int tmp_t6, tmp_t7, tmp_t8;
+    __asm__ __volatile__(
+        ".set push                                                \n"
+        ".set noreorder                                           \n"
+        "lw                %[tmp_t1],   0(%[src_argb0])           \n"
+        "lw                %[tmp_t2],   4(%[src_argb0])           \n"
+        "lw                %[tmp_t3],   8(%[src_argb0])           \n"
+        "lw                %[tmp_t4],   12(%[src_argb0])          \n"
+        "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
+        "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
+        "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
+        "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
+        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
+        "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
+        "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
+        "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
+        "mult              $ac0,        %[const5],     %[const5]  \n"
+        "mult              $ac1,        %[const5],     %[const5]  \n"
+        "mult              $ac2,        %[const5],     %[const5]  \n"
+        "mult              $ac3,        %[const5],     %[const5]  \n"
+        "dpa.w.ph          $ac0,        %[tmp_t5],     %[const1]  \n"
+        "dpa.w.ph          $ac1,        %[tmp_t6],     %[const1]  \n"
+        "dpa.w.ph          $ac2,        %[tmp_t7],     %[const1]  \n"
+        "dpa.w.ph          $ac3,        %[tmp_t8],     %[const1]  \n"
+        "dpa.w.ph          $ac0,        %[tmp_t1],     %[const2]  \n"
+        "dpa.w.ph          $ac1,        %[tmp_t2],     %[const2]  \n"
+        "dpa.w.ph          $ac2,        %[tmp_t3],     %[const2]  \n"
+        "dpa.w.ph          $ac3,        %[tmp_t4],     %[const2]  \n"
+        "extr_r.w          %[tmp_t1],   $ac0,          8          \n"
+        "extr_r.w          %[tmp_t2],   $ac1,          8          \n"
+        "extr_r.w          %[tmp_t3],   $ac2,          8          \n"
+        "extr_r.w          %[tmp_t4],   $ac3,          8          \n"
+        "addiu             %[dst_y],    %[dst_y],      4          \n"
+        "addiu             %[src_argb0],%[src_argb0],  16         \n"
+        "sb                %[tmp_t1],   -4(%[dst_y])              \n"
+        "sb                %[tmp_t2],   -3(%[dst_y])              \n"
+        "sb                %[tmp_t3],   -2(%[dst_y])              \n"
+        "sb                %[tmp_t4],   -1(%[dst_y])              \n"
+        ".set pop                                                 \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+          "$ac3hi");
+  }
+}
+
+void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  int const1 = 0x00810042;
+  int const2 = 0x00000019;
+  int const5 = 0x40;
+  for (x = 0; x < width; x += 4) {
+    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    int tmp_t6, tmp_t7, tmp_t8;
+    __asm__ __volatile__(
+        ".set push                                                \n"
+        ".set noreorder                                           \n"
+        "lw                %[tmp_t1],   0(%[src_argb0])           \n"
+        "lw                %[tmp_t2],   4(%[src_argb0])           \n"
+        "lw                %[tmp_t3],   8(%[src_argb0])           \n"
+        "lw                %[tmp_t4],   12(%[src_argb0])          \n"
+        "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
+        "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
+        "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
+        "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
+        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
+        "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
+        "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
+        "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
+        "mult              $ac0,        %[const5],     %[const5]  \n"
+        "mult              $ac1,        %[const5],     %[const5]  \n"
+        "mult              $ac2,        %[const5],     %[const5]  \n"
+        "mult              $ac3,        %[const5],     %[const5]  \n"
+        "dpa.w.ph          $ac0,        %[tmp_t5],     %[const1]  \n"
+        "dpa.w.ph          $ac1,        %[tmp_t6],     %[const1]  \n"
+        "dpa.w.ph          $ac2,        %[tmp_t7],     %[const1]  \n"
+        "dpa.w.ph          $ac3,        %[tmp_t8],     %[const1]  \n"
+        "dpa.w.ph          $ac0,        %[tmp_t1],     %[const2]  \n"
+        "dpa.w.ph          $ac1,        %[tmp_t2],     %[const2]  \n"
+        "dpa.w.ph          $ac2,        %[tmp_t3],     %[const2]  \n"
+        "dpa.w.ph          $ac3,        %[tmp_t4],     %[const2]  \n"
+        "extr_r.w          %[tmp_t1],   $ac0,          8          \n"
+        "extr_r.w          %[tmp_t2],   $ac1,          8          \n"
+        "extr_r.w          %[tmp_t3],   $ac2,          8          \n"
+        "extr_r.w          %[tmp_t4],   $ac3,          8          \n"
+        "addiu             %[src_argb0],%[src_argb0],  16         \n"
+        "addiu             %[dst_y],    %[dst_y],      4          \n"
+        "sb                %[tmp_t1],   -4(%[dst_y])              \n"
+        "sb                %[tmp_t2],   -3(%[dst_y])              \n"
+        "sb                %[tmp_t3],   -2(%[dst_y])              \n"
+        "sb                %[tmp_t4],   -1(%[dst_y])              \n"
+        ".set pop                                                 \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+          "$ac3hi");
+  }
+}
+
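+// Note: RGBA stores the alpha byte first (A, B, G, R in memory), so the
+// unaligned "ulw ... 0+1" loads below skip it and present B, G, R in the
+// same order ARGBToUVRow_DSPR2 sees, which is why both functions can use
+// identical coefficient constants.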
+void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
+                       int src_stride_rgb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+  int x;
+  int const1 = 0xffb60070;
+  int const2 = 0x0000ffda;
+  int const3 = 0xffa2ffee;
+  int const4 = 0x00000070;
+  int const5 = 0x100;
+
+  for (x = 0; x < width - 1; x += 2) {
+    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    int tmp_t6, tmp_t7, tmp_t8;
+    __asm__ __volatile__(
+        ".set push                                                \n"
+        ".set noreorder                                           \n"
+        "ulw               %[tmp_t1],   0+1(%[src_rgb0])          \n"
+        "ulw               %[tmp_t2],   4+1(%[src_rgb0])          \n"
+        "ulw               %[tmp_t3],   0+1(%[src_rgb1])          \n"
+        "ulw               %[tmp_t4],   4+1(%[src_rgb1])          \n"
+        "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
+        "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
+        "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
+        "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
+        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
+        "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
+        "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
+        "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
+        "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t6]  \n"
+        "addu.ph           %[tmp_t7],   %[tmp_t7],     %[tmp_t8]  \n"
+        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t2]  \n"
+        "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t4]  \n"
+        "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t7]  \n"
+        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t3]  \n"
+        "shrl.ph           %[tmp_t5],   %[tmp_t5],     2          \n"
+        "shrl.ph           %[tmp_t1],   %[tmp_t1],     2          \n"
+        "mult              $ac0,        %[const5],     %[const5]  \n"
+        "mult              $ac1,        %[const5],     %[const5]  \n"
+        "dpaq_s.w.ph       $ac0,        %[tmp_t5],     %[const1]  \n"
+        "dpaq_s.w.ph       $ac1,        %[tmp_t5],     %[const3]  \n"
+        "dpaq_s.w.ph       $ac0,        %[tmp_t1],     %[const2]  \n"
+        "dpaq_s.w.ph       $ac1,        %[tmp_t1],     %[const4]  \n"
+        "extr_r.w          %[tmp_t7],   $ac0,          9          \n"
+        "extr_r.w          %[tmp_t8],   $ac1,          9          \n"
+        "addiu             %[src_rgb0], %[src_rgb0], 8            \n"
+        "addiu             %[src_rgb1], %[src_rgb1], 8            \n"
+        "addiu             %[dst_u],    %[dst_u],    1            \n"
+        "addiu             %[dst_v],    %[dst_v],    1            \n"
+        "sb                %[tmp_t7],   -1(%[dst_u])              \n"
+        "sb                %[tmp_t8],   -1(%[dst_v])              \n"
+        ".set pop                                                 \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+          [const4] "r"(const4), [const5] "r"(const5)
+        : "hi", "lo", "$ac1lo", "$ac1hi");
+  }
+}
+
+void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  int const1 = 0x00420081;
+  int const2 = 0x00190000;
+  int const5 = 0x40;
+  for (x = 0; x < width; x += 4) {
+    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    int tmp_t6, tmp_t7, tmp_t8;
+    __asm__ __volatile__(
+        ".set push                                                \n"
+        ".set noreorder                                           \n"
+        "lw                %[tmp_t1],   0(%[src_argb0])           \n"
+        "lw                %[tmp_t2],   4(%[src_argb0])           \n"
+        "lw                %[tmp_t3],   8(%[src_argb0])           \n"
+        "lw                %[tmp_t4],   12(%[src_argb0])          \n"
+        "preceu.ph.qbl     %[tmp_t5],   %[tmp_t1]                 \n"
+        "preceu.ph.qbr     %[tmp_t1],   %[tmp_t1]                 \n"
+        "preceu.ph.qbl     %[tmp_t6],   %[tmp_t2]                 \n"
+        "preceu.ph.qbr     %[tmp_t2],   %[tmp_t2]                 \n"
+        "preceu.ph.qbl     %[tmp_t7],   %[tmp_t3]                 \n"
+        "preceu.ph.qbr     %[tmp_t3],   %[tmp_t3]                 \n"
+        "preceu.ph.qbl     %[tmp_t8],   %[tmp_t4]                 \n"
+        "preceu.ph.qbr     %[tmp_t4],   %[tmp_t4]                 \n"
+        "mult              $ac0,        %[const5],     %[const5]  \n"
+        "mult              $ac1,        %[const5],     %[const5]  \n"
+        "mult              $ac2,        %[const5],     %[const5]  \n"
+        "mult              $ac3,        %[const5],     %[const5]  \n"
+        "dpa.w.ph          $ac0,        %[tmp_t5],     %[const1]  \n"
+        "dpa.w.ph          $ac1,        %[tmp_t6],     %[const1]  \n"
+        "dpa.w.ph          $ac2,        %[tmp_t7],     %[const1]  \n"
+        "dpa.w.ph          $ac3,        %[tmp_t8],     %[const1]  \n"
+        "dpa.w.ph          $ac0,        %[tmp_t1],     %[const2]  \n"
+        "dpa.w.ph          $ac1,        %[tmp_t2],     %[const2]  \n"
+        "dpa.w.ph          $ac2,        %[tmp_t3],     %[const2]  \n"
+        "dpa.w.ph          $ac3,        %[tmp_t4],     %[const2]  \n"
+        "extr_r.w          %[tmp_t1],   $ac0,          8          \n"
+        "extr_r.w          %[tmp_t2],   $ac1,          8          \n"
+        "extr_r.w          %[tmp_t3],   $ac2,          8          \n"
+        "extr_r.w          %[tmp_t4],   $ac3,          8          \n"
+        "addiu             %[dst_y],    %[dst_y],      4          \n"
+        "addiu             %[src_argb0],%[src_argb0],  16         \n"
+        "sb                %[tmp_t1],   -4(%[dst_y])              \n"
+        "sb                %[tmp_t2],   -3(%[dst_y])              \n"
+        "sb                %[tmp_t3],   -2(%[dst_y])              \n"
+        "sb                %[tmp_t4],   -1(%[dst_y])              \n"
+        ".set pop                                                 \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+          [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
+        : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
+        : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
+          "$ac3hi");
+  }
+}
+
+void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
+                       int src_stride_rgb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
+  int x;
+  int const1 = 0xffb60070;
+  int const2 = 0x0000ffda;
+  int const3 = 0xffa2ffee;
+  int const4 = 0x00000070;
+  int const5 = 0x100;
+
+  for (x = 0; x < width - 1; x += 2) {
+    int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
+    int tmp_t6, tmp_t7, tmp_t8;
+    __asm__ __volatile__(
+        ".set push                                                \n"
+        ".set noreorder                                           \n"
+        "lw                %[tmp_t1],   0(%[src_rgb0])            \n"
+        "lw                %[tmp_t2],   4(%[src_rgb0])            \n"
+        "lw                %[tmp_t3],   0(%[src_rgb1])            \n"
+        "lw                %[tmp_t4],   4(%[src_rgb1])            \n"
+        "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
+        "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
+        "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
+        "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
+        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
+        "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
+        "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
+        "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
+        "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t6]  \n"
+        "addu.ph           %[tmp_t7],   %[tmp_t7],     %[tmp_t8]  \n"
+        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t2]  \n"
+        "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t4]  \n"
+        "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t7]  \n"
+        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t3]  \n"
+        "shrl.ph           %[tmp_t5],   %[tmp_t5],     2          \n"
+        "shrl.ph           %[tmp_t1],   %[tmp_t1],     2          \n"
+        "mult              $ac0,        %[const5],     %[const5]  \n"
+        "mult              $ac1,        %[const5],     %[const5]  \n"
+        "dpaq_s.w.ph       $ac0,        %[tmp_t5],     %[const1]  \n"
+        "dpaq_s.w.ph       $ac1,        %[tmp_t5],     %[const3]  \n"
+        "dpaq_s.w.ph       $ac0,        %[tmp_t1],     %[const2]  \n"
+        "dpaq_s.w.ph       $ac1,        %[tmp_t1],     %[const4]  \n"
+        "extr_r.w          %[tmp_t7],   $ac0,          9          \n"
+        "extr_r.w          %[tmp_t8],   $ac1,          9          \n"
+        "addiu             %[src_rgb0], %[src_rgb0], 8            \n"
+        "addiu             %[src_rgb1], %[src_rgb1], 8            \n"
+        "addiu             %[dst_u],    %[dst_u],    1            \n"
+        "addiu             %[dst_v],    %[dst_v],    1            \n"
+        "sb                %[tmp_t7],   -1(%[dst_u])              \n"
+        "sb                %[tmp_t8],   -1(%[dst_v])              \n"
+        ".set pop                                                 \n"
+        : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
+          [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
+          [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+          [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
+          [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
+          [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
+        : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
+          [const4] "r"(const4), [const5] "r"(const5)
+        : "hi", "lo", "$ac1lo", "$ac1hi");
+  }
+}
+
+#endif  // __mips_dsp_rev >= 2
+
+#endif  // defined(__mips__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
index 1ac7ef1..8735070 100644
--- a/files/source/row_gcc.cc
+++ b/files/source/row_gcc.cc
@@ -1,4 +1,3 @@
-// VERSION 2
 /*
  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
@@ -23,165 +22,133 @@
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 // Constants for ARGB
-static vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+                        13, 65, 33, 0, 13, 65, 33, 0};
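+// Note: these byte weights feed pmaddubsw, so the SSSE3/AVX2 luma path is
+// roughly ((13 * b + 65 * g + 33 * r) >> 7) + 16, with kAddY16 below
+// supplying the +16 (a reference note, not the exact SIMD sequence).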
 
 // JPeg full range.
-static vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+                         15, 75, 38, 0, 15, 75, 38, 0};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
 
-static vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                        112, -74, -38, 0, 112, -74, -38, 0};
 
-static vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+                         127, -84, -43, 0, 127, -84, -43, 0};
 
 static vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
 
-static vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+                         -20, -107, 127, 0, -20, -107, 127, 0};
 
 // Constants for BGRA
-static vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+                        0, 33, 65, 13, 0, 33, 65, 13};
 
-static vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                        0, -38, -74, 112, 0, -38, -74, 112};
 
-static vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                        0, 112, -94, -18, 0, 112, -94, -18};
 
 // Constants for ABGR
-static vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+                        33, 65, 13, 0, 33, 65, 13, 0};
 
-static vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                        -38, -74, 112, 0, -38, -74, 112, 0};
 
-static vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                        112, -94, -18, 0, 112, -94, -18, 0};
 
 // Constants for RGBA.
-static vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+                        0, 13, 65, 33, 0, 13, 65, 33};
 
-static vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                        0, 112, -74, -38, 0, 112, -74, -38};
 
-static vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                        0, -18, -94, 112, 0, -18, -94, 112};
 
-static uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+                        16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
 
 // 7 bit fixed point 0.5.
-static vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
+static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
 
-static uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                          128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
-static uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                            0x8080u, 0x8080u, 0x8080u, 0x8080u};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
 
 // Shuffle table for converting RGB24 to ARGB.
-static uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u,  5u,  13u,
+                                        6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
-static uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
+                                      8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
 
 // Shuffle table for converting RAW to RGB24.  First 8.
 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
-  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Middle 8.
 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
-  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Last 8.
 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
-  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RGB24.
 static uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RAW.
 static uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
 static uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
 
 // YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
+static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
+                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
+                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
 
 // YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
+                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
+                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
 
 // UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
+                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
+                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
 
 // UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
+                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
+                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
 
 // NV21 shuf 8 VU to 16 UV.
 static const lvec8 kShuffleNV21 = {
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
 };
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
@@ -191,7 +158,7 @@
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pslld     $0x18,%%xmm5                    \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movq      " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x8,0) ",%0            \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
@@ -220,7 +187,7 @@
     "pslld     $0x18,%%xmm5                    \n"
     "movdqa    %3,%%xmm4                       \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
@@ -258,7 +225,7 @@
     "pslld     $0x18,%%xmm5                    \n"
     "movdqa    %3,%%xmm4                       \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
@@ -296,7 +263,7 @@
    "movdqa     %4,%%xmm4                       \n"
    "movdqa     %5,%%xmm5                       \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
     "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
@@ -338,7 +305,7 @@
     "sub       %0,%1                           \n"
     "sub       %0,%1                           \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
@@ -385,7 +352,7 @@
     "sub       %0,%1                           \n"
     "sub       %0,%1                           \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
@@ -429,7 +396,7 @@
     "sub       %0,%1                           \n"
     "sub       %0,%1                           \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
     "pand      %%xmm4,%%xmm0                   \n"
@@ -461,7 +428,7 @@
   asm volatile (
     "movdqa    %3,%%xmm6                       \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -499,7 +466,7 @@
   asm volatile (
     "movdqa    %3,%%xmm6                       \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -543,7 +510,7 @@
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pslld     $0xb,%%xmm5                     \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
@@ -569,98 +536,99 @@
   );
 }
 
-void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "movd       %3,%%xmm6                      \n"
-    "punpcklbw  %%xmm6,%%xmm6                  \n"
-    "movdqa     %%xmm6,%%xmm7                  \n"
-    "punpcklwd  %%xmm6,%%xmm6                  \n"
-    "punpckhwd  %%xmm7,%%xmm7                  \n"
-    "pcmpeqb    %%xmm3,%%xmm3                  \n"
-    "psrld      $0x1b,%%xmm3                   \n"
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrld      $0x1a,%%xmm4                   \n"
-    "pslld      $0x5,%%xmm4                    \n"
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "pslld      $0xb,%%xmm5                    \n"
+void ARGBToRGB565DitherRow_SSE2(const uint8* src,
+                                uint8* dst,
+                                const uint32 dither4,
+                                int width) {
+  asm volatile(
+      "movd       %3,%%xmm6                      \n"
+      "punpcklbw  %%xmm6,%%xmm6                  \n"
+      "movdqa     %%xmm6,%%xmm7                  \n"
+      "punpcklwd  %%xmm6,%%xmm6                  \n"
+      "punpckhwd  %%xmm7,%%xmm7                  \n"
+      "pcmpeqb    %%xmm3,%%xmm3                  \n"
+      "psrld      $0x1b,%%xmm3                   \n"
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrld      $0x1a,%%xmm4                   \n"
+      "pslld      $0x5,%%xmm4                    \n"
+      "pcmpeqb    %%xmm5,%%xmm5                  \n"
+      "pslld      $0xb,%%xmm5                    \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu     (%0),%%xmm0                    \n"
-    "paddusb    %%xmm6,%%xmm0                  \n"
-    "movdqa     %%xmm0,%%xmm1                  \n"
-    "movdqa     %%xmm0,%%xmm2                  \n"
-    "pslld      $0x8,%%xmm0                    \n"
-    "psrld      $0x3,%%xmm1                    \n"
-    "psrld      $0x5,%%xmm2                    \n"
-    "psrad      $0x10,%%xmm0                   \n"
-    "pand       %%xmm3,%%xmm1                  \n"
-    "pand       %%xmm4,%%xmm2                  \n"
-    "pand       %%xmm5,%%xmm0                  \n"
-    "por        %%xmm2,%%xmm1                  \n"
-    "por        %%xmm1,%%xmm0                  \n"
-    "packssdw   %%xmm0,%%xmm0                  \n"
-    "lea        0x10(%0),%0                    \n"
-    "movq       %%xmm0,(%1)                    \n"
-    "lea        0x8(%1),%1                     \n"
-    "sub        $0x4,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(dither4) // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu     (%0),%%xmm0                    \n"
+      "paddusb    %%xmm6,%%xmm0                  \n"
+      "movdqa     %%xmm0,%%xmm1                  \n"
+      "movdqa     %%xmm0,%%xmm2                  \n"
+      "pslld      $0x8,%%xmm0                    \n"
+      "psrld      $0x3,%%xmm1                    \n"
+      "psrld      $0x5,%%xmm2                    \n"
+      "psrad      $0x10,%%xmm0                   \n"
+      "pand       %%xmm3,%%xmm1                  \n"
+      "pand       %%xmm4,%%xmm2                  \n"
+      "pand       %%xmm5,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm1                  \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "packssdw   %%xmm0,%%xmm0                  \n"
+      "lea        0x10(%0),%0                    \n"
+      "movq       %%xmm0,(%1)                    \n"
+      "lea        0x8(%1),%1                     \n"
+      "sub        $0x4,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "m"(dither4)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "vbroadcastss %3,%%xmm6                    \n"
-    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
-    "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
-    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
-    "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
-    "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
-    "vpslld     $0x5,%%ymm4,%%ymm4             \n"
-    "vpslld     $0xb,%%ymm3,%%ymm5             \n"
+void ARGBToRGB565DitherRow_AVX2(const uint8* src,
+                                uint8* dst,
+                                const uint32 dither4,
+                                int width) {
+  asm volatile(
+      "vbroadcastss %3,%%xmm6                    \n"
+      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
+      "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
+      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
+      "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
+      "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
+      "vpslld     $0x5,%%ymm4,%%ymm4             \n"
+      "vpslld     $0xb,%%ymm3,%%ymm5             \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    (%0),%%ymm0                    \n"
-    "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
-    "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
-    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
-    "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
-    "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "lea        0x20(%0),%0                    \n"
-    "vmovdqu    %%xmm0,(%1)                    \n"
-    "lea        0x10(%1),%1                    \n"
-    "sub        $0x8,%2                        \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(dither4) // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
+      "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
+      "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
+      "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
+      "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
+      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "lea        0x20(%0),%0                    \n"
+      "vmovdqu    %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x8,%2                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "m"(dither4)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
 
-
 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
   asm volatile (
     "pcmpeqb   %%xmm4,%%xmm4                   \n"
@@ -671,8 +639,9 @@
     "pslld     $0xa,%%xmm6                     \n"
     "pcmpeqb   %%xmm7,%%xmm7                   \n"
     "pslld     $0xf,%%xmm7                     \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
@@ -708,8 +677,9 @@
     "psllw     $0xc,%%xmm4                     \n"
     "movdqa    %%xmm4,%%xmm3                   \n"
     "psrlw     $0x8,%%xmm3                     \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "pand      %%xmm3,%%xmm0                   \n"
@@ -737,8 +707,9 @@
   asm volatile (
     "movdqa    %3,%%xmm4                       \n"
     "movdqa    %4,%%xmm5                       \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -775,8 +746,9 @@
   asm volatile (
     "movdqa    %3,%%xmm4                       \n"
     "movdqa    %4,%%xmm5                       \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -809,9 +781,7 @@
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
 
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
@@ -819,8 +789,9 @@
     "vbroadcastf128 %3,%%ymm4                  \n"
     "vbroadcastf128 %4,%%ymm5                  \n"
     "vmovdqu    %5,%%ymm6                      \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
@@ -860,8 +831,9 @@
     "vbroadcastf128 %3,%%ymm4                  \n"
     "vbroadcastf128 %4,%%ymm5                  \n"
     "vmovdqu    %5,%%ymm6                      \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
@@ -896,15 +868,19 @@
 #endif  // HAS_ARGBTOYJROW_AVX2
 
 #ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+                       int src_stride_argb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "movdqa    %5,%%xmm3                       \n"
     "movdqa    %6,%%xmm4                       \n"
     "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm0                   \n"
@@ -961,18 +937,21 @@
 #ifdef HAS_ARGBTOUVROW_AVX2
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+void ARGBToUVRow_AVX2(const uint8* src_argb0,
+                      int src_stride_argb,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "vbroadcastf128 %5,%%ymm5                  \n"
     "vbroadcastf128 %6,%%ymm6                  \n"
     "vbroadcastf128 %7,%%ymm7                  \n"
-    "sub       %1,%2                           \n"
+    "sub        %1,%2                          \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
@@ -981,7 +960,7 @@
     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
     VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
     VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
+    "lea        " MEMLEA(0x80,0) ",%0          \n"
     "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
     "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
     "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
@@ -1004,9 +983,9 @@
 
     "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
     VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
+    "lea        " MEMLEA(0x10,1) ",%1          \n"
+    "sub        $0x20,%3                       \n"
+    "jg         1b                             \n"
     "vzeroupper                                \n"
   : "+r"(src_argb0),       // %0
     "+r"(dst_u),           // %1
@@ -1024,15 +1003,19 @@
 #endif  // HAS_ARGBTOUVROW_AVX2
 
 #ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+                       int src_stride_argb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "vbroadcastf128 %5,%%ymm5                  \n"
     "vbroadcastf128 %6,%%ymm6                  \n"
     "vbroadcastf128 %7,%%ymm7                  \n"
-    "sub       %1,%2                           \n"
+    "sub        %1,%2                          \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
     "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
     "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
@@ -1085,15 +1068,19 @@
 #endif  // HAS_ARGBTOUVJROW_AVX2
 
 #ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+                        int src_stride_argb,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int width) {
   asm volatile (
     "movdqa    %5,%%xmm3                       \n"
     "movdqa    %6,%%xmm4                       \n"
     "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm0                   \n"
@@ -1149,15 +1136,18 @@
 #endif  // HAS_ARGBTOUVJROW_SSSE3
 
 #ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+                          uint8* dst_u,
+                          uint8* dst_v,
                           int width) {
   asm volatile (
     "movdqa    %4,%%xmm3                       \n"
     "movdqa    %5,%%xmm4                       \n"
     "movdqa    %6,%%xmm5                       \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -1209,8 +1199,9 @@
   asm volatile (
     "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -1239,15 +1230,19 @@
   );
 }
 
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
+                       int src_stride_bgra,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "movdqa    %5,%%xmm3                       \n"
     "movdqa    %6,%%xmm4                       \n"
     "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm0                   \n"
@@ -1304,8 +1299,9 @@
   asm volatile (
     "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -1338,8 +1334,9 @@
   asm volatile (
     "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
@@ -1368,15 +1365,19 @@
   );
 }
 
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
+                       int src_stride_abgr,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "movdqa    %5,%%xmm3                       \n"
     "movdqa    %6,%%xmm4                       \n"
     "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm0                   \n"
@@ -1429,15 +1430,19 @@
   );
 }
 
-void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
+                       int src_stride_rgba,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "movdqa    %5,%%xmm3                       \n"
     "movdqa    %6,%%xmm4                       \n"
     "movdqa    %7,%%xmm5                       \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
     "pavgb     %%xmm7,%%xmm0                   \n"
@@ -1493,8 +1498,8 @@
 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
 
 // Read 8 UV from 444
-#define READYUV444                                                             \
-    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+#define READYUV444 \
+  "movq       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
     MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
@@ -1503,8 +1508,8 @@
     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
 
 // Read 4 UV from 422, upsample to 8 UV
-#define READYUV422                                                             \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+#define READYUV422 \
+  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
@@ -1514,8 +1519,8 @@
     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
 
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422                                                            \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+#define READYUVA422 \
+  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
@@ -1526,29 +1531,9 @@
     "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
     "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
 
-// Read 2 UV from 411, upsample to 8 UV.
-// reading 4 bytes is an msan violation.
-//    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"
-//    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
-// pinsrw fails with drmemory
-//  __asm pinsrw     xmm0, [esi], 0        /* U */
-//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
-#define READYUV411_TEMP                                                        \
-    "movzwl     " MEMACCESS([u_buf]) ",%[temp]                  \n"            \
-    "movd       %[temp],%%xmm0                                  \n"            \
-    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) "       \n"            \
-    "movd       %[temp],%%xmm1                                  \n"            \
-    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "punpckldq  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
-
 // Read 4 UV from NV12, upsample to 8 UV
-#define READNV12                                                               \
-    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
+#define READNV12 \
+  "movq       " MEMACCESS([uv_buf]) ",%%xmm0                    \n"            \
     "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
     "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
     "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
@@ -1556,8 +1541,8 @@
     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
 
 // Read 4 VU from NV21, upsample to 8 UV
-#define READNV21                                                               \
-    "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
+#define READNV21 \
+  "movq       " MEMACCESS([vu_buf]) ",%%xmm0                    \n"            \
     "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
     "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
     "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
@@ -1565,24 +1550,24 @@
     "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
 
 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
-#define READYUY2                                                               \
-    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \
+#define READYUY2 \
+  "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                  \n"            \
     "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
     "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
     "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
     "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
 
 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
-#define READUYVY                                                               \
-    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                \n"            \
+#define READUYVY \
+  "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                  \n"            \
     "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
     "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
     "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
     "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
 
 #if defined(__x86_64__)
-#define YUVTORGB_SETUP(yuvconstants)                                           \
-    "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"            \
+#define YUVTORGB_SETUP(yuvconstants) \
+  "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8              \n"            \
     "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
     "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
     "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
@@ -1590,37 +1575,37 @@
     "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
     "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                                 \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     %%xmm11,%%xmm0                                  \n"            \
-    "pmaddubsw  %%xmm8,%%xmm1                                   \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     %%xmm12,%%xmm1                                  \n"            \
-    "pmaddubsw  %%xmm9,%%xmm2                                   \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     %%xmm13,%%xmm2                                  \n"            \
-    "pmaddubsw  %%xmm10,%%xmm3                                  \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "pmulhuw    %%xmm14,%%xmm4                                  \n"            \
-    "paddsw     %%xmm4,%%xmm0                                   \n"            \
-    "paddsw     %%xmm4,%%xmm1                                   \n"            \
-    "paddsw     %%xmm4,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB(yuvconstants)                                    \
+  "movdqa     %%xmm0,%%xmm1                                   \n" \
+  "movdqa     %%xmm0,%%xmm2                                   \n" \
+  "movdqa     %%xmm0,%%xmm3                                   \n" \
+  "movdqa     %%xmm11,%%xmm0                                  \n" \
+  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
+  "psubw      %%xmm1,%%xmm0                                   \n" \
+  "movdqa     %%xmm12,%%xmm1                                  \n" \
+  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
+  "psubw      %%xmm2,%%xmm1                                   \n" \
+  "movdqa     %%xmm13,%%xmm2                                  \n" \
+  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
+  "psubw      %%xmm3,%%xmm2                                   \n" \
+  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
+  "paddsw     %%xmm4,%%xmm0                                   \n" \
+  "paddsw     %%xmm4,%%xmm1                                   \n" \
+  "paddsw     %%xmm4,%%xmm2                                   \n" \
+  "psraw      $0x6,%%xmm0                                     \n" \
+  "psraw      $0x6,%%xmm1                                     \n" \
+  "psraw      $0x6,%%xmm2                                     \n" \
+  "packuswb   %%xmm0,%%xmm0                                   \n" \
+  "packuswb   %%xmm1,%%xmm1                                   \n" \
+  "packuswb   %%xmm2,%%xmm2                                   \n"
 #define YUVTORGB_REGS \
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
 
 #else
 #define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                                 \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+#define YUVTORGB(yuvconstants) \
+  "movdqa     %%xmm0,%%xmm1                                     \n"            \
     "movdqa     %%xmm0,%%xmm2                                   \n"            \
     "movdqa     %%xmm0,%%xmm3                                   \n"            \
     "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
@@ -1646,8 +1631,8 @@
 #endif
 
 // Store 8 ARGB values.
-#define STOREARGB                                                              \
-    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
+#define STOREARGB \
+  "punpcklbw  %%xmm1,%%xmm0                                      \n"           \
     "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
     "movdqa     %%xmm0,%%xmm1                                    \n"           \
     "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
@@ -1657,8 +1642,8 @@
     "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
 
 // Store 8 RGBA values.
-#define STORERGBA                                                              \
-    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
+#define STORERGBA \
+  "pcmpeqb   %%xmm5,%%xmm5                                       \n"           \
     "punpcklbw %%xmm2,%%xmm1                                     \n"           \
     "punpcklbw %%xmm0,%%xmm5                                     \n"           \
     "movdqa    %%xmm5,%%xmm0                                     \n"           \
@@ -1678,8 +1663,9 @@
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV444
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1707,8 +1693,9 @@
     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
     "sub       %[u_buf],%[v_buf]               \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422
     YUVTORGB(yuvconstants)
     "punpcklbw %%xmm1,%%xmm0                   \n"
@@ -1728,7 +1715,7 @@
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
     [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
@@ -1751,8 +1738,9 @@
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1777,11 +1765,13 @@
                                      uint8* dst_argb,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUVA422
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1792,7 +1782,7 @@
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [a_buf]"+r"(a_buf),    // %[a_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
     [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
@@ -1801,55 +1791,22 @@
   : "memory", "cc", NACL_R14 YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_I422ALPHATOARGBROW_SSSE3
 
-#ifdef HAS_I411TOARGBROW_SSSE3
-void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  int temp;
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV411_TEMP
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "subl      $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),        // %[y_buf]
-    [u_buf]"+r"(u_buf),        // %[u_buf]
-    [v_buf]"+r"(v_buf),        // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [temp]"=&r"(temp),         // %[temp]
-#if defined(__i386__) && defined(__pic__)
-    [width]"+m"(width)         // %[width]
-#else
-    [width]"+rm"(width)        // %[width]
-#endif
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif
-
 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                 const uint8* uv_buf,
                                 uint8* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV12
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1863,6 +1820,7 @@
     : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
@@ -1870,11 +1828,13 @@
                                 uint8* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV21
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1889,17 +1849,20 @@
     : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                 uint8* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUY2
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1914,17 +1877,20 @@
     : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
                                 uint8* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READUYVY
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1939,6 +1905,7 @@
     : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
@@ -1951,8 +1918,9 @@
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422
     YUVTORGB(yuvconstants)
     STORERGBA
@@ -1972,8 +1940,8 @@
 #endif  // HAS_I422TOARGBROW_SSSE3
 
 // Read 16 UV from 444
-#define READYUV444_AVX2                                                        \
-    "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+#define READYUV444_AVX2 \
+  "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
     MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
     "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
@@ -1985,8 +1953,8 @@
     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
 
 // Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2                                                        \
-    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+#define READYUV422_AVX2 \
+  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
     MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
@@ -1998,8 +1966,8 @@
     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
 
 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
-#define READYUVA422_AVX2                                                       \
-    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+#define READYUVA422_AVX2 \
+  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
     MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
     "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
@@ -2013,23 +1981,9 @@
     "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
     "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
 
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2                                                        \
-    "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpckldq %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
-
 // Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                      \n"        \
+#define READNV12_AVX2 \
+  "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                        \n"        \
     "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
     "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
@@ -2039,8 +1993,8 @@
     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
 
 // Read 8 VU from NV21, upsample to 16 UV.
-#define READNV21_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \
+#define READNV21_AVX2 \
+  "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                        \n"        \
     "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
     "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
@@ -2050,53 +2004,57 @@
     "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
 
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \
+#define READYUY2_AVX2 \
+  "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                      \n"        \
     "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
     "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
     "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
     "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2                                                          \
-    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                   \n"        \
+#define READUYVY_AVX2 \
+  "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                     \n"        \
     "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
     "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
     "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
     "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
 
 #if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
-    "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \
+#define YUVTORGB_SETUP_AVX2(yuvconstants) \
+  "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8              \n"           \
     "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
     "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
     "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
     "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
     "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
     "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
-#define YUVTORGB_AVX2(yuvconstants)                                            \
-    "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n"        \
-    "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n"        \
-    "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n"        \
-    "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n"        \
-    "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n"        \
-    "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n"        \
-    "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n"        \
-    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
-    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
-    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
-    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
-    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
-    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
-    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+
+#define YUVTORGB_AVX2(yuvconstants)                                   \
+  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
+  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
+  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
+  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
+  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
+  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
+  "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
+  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
+  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
+  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n" \
+  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
+  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
+  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
+  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
+  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
+  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+
 #define YUVTORGB_REGS_AVX2 \
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
 #else  // Convert 16 pixels: 16 UV and 16 Y.
+
 #define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB_AVX2(yuvconstants)                                            \
-    "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2   \n"        \
+#define YUVTORGB_AVX2(yuvconstants) \
+  "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2     \n"        \
     "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
     "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
     "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
@@ -2119,8 +2077,8 @@
 #endif
 
 // Store 16 ARGB values.
-#define STOREARGB_AVX2                                                         \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+#define STOREARGB_AVX2 \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                  \n"        \
     "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
     "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
     "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
@@ -2143,8 +2101,9 @@
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV444_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2163,39 +2122,6 @@
 }
 #endif  // HAS_I444TOARGBROW_AVX2
 
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV411_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_I411TOARGBROW_AVX2
-
 #if defined(HAS_I422TOARGBROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
@@ -2209,13 +2135,15 @@
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
     "sub       $0x10,%[width]                  \n"
     "jg        1b                              \n"
+
     "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
@@ -2233,17 +2161,19 @@
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               const uint8* a_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    const uint8* a_buf,
+                                    uint8* dst_argb,
+                                    const struct YuvConstants* yuvconstants,
+                                    int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUVA422_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2255,7 +2185,7 @@
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [a_buf]"+r"(a_buf),    // %[a_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
     [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
@@ -2264,6 +2194,7 @@
   : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_I422ALPHATOARGBROW_AVX2
 
@@ -2280,8 +2211,9 @@
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422_AVX2
     YUVTORGB_AVX2(yuvconstants)
 
@@ -2318,11 +2250,13 @@
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV12_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2337,6 +2271,7 @@
     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
     "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_NV12TOARGBROW_AVX2
 
@@ -2348,11 +2283,13 @@
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV21_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2368,6 +2305,7 @@
     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_NV21TOARGBROW_AVX2
 
@@ -2378,11 +2316,13 @@
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUY2_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2398,6 +2338,7 @@
     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_YUY2TOARGBROW_AVX2
 
@@ -2408,11 +2349,13 @@
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READUYVY_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2428,6 +2371,7 @@
     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_UYVYTOARGBROW_AVX2
 
@@ -2442,8 +2386,9 @@
     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
     "pcmpeqb   %%xmm4,%%xmm4                   \n"
     "pslld     $0x18,%%xmm4                    \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     "movq      " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x8,0) ",%0            \n"
@@ -2491,7 +2436,7 @@
     "vpslld     $0x18,%%ymm4,%%ymm4            \n"
 
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
     "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
     "lea        " MEMLEA(0x10,0) ",%0          \n"
@@ -2525,16 +2470,16 @@
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
-static uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                               7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
 
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
     "movdqa    %3,%%xmm5                       \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
     "pshufb    %%xmm5,%%xmm0                   \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
@@ -2556,8 +2501,9 @@
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
     "vbroadcastf128 %3,%%ymm5                  \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
     "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
     "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
@@ -2578,18 +2524,20 @@
 
 #ifdef HAS_MIRRORUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorUVRow_SSSE3(const uint8* src,
+                       uint8* dst_u,
+                       uint8* dst_v,
                        int width) {
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
     "movdqa    %4,%%xmm1                       \n"
     "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(-0x10,0) ",%0          \n"
     "pshufb    %%xmm1,%%xmm0                   \n"
@@ -2615,8 +2563,9 @@
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
     "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
     "lea       " MEMLEA(-0x10,0) ",%0          \n"
@@ -2636,15 +2585,14 @@
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
   asm volatile (
     "vmovdqu    %3,%%ymm5                      \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
     "lea        " MEMLEA(0x20,1) ",%1          \n"
@@ -2662,31 +2610,34 @@
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_AVX2(const uint8* src_uv,
+                     uint8* dst_u,
+                     uint8* dst_v,
                      int width) {
   asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
-    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
-    "sub        %1,%2                            \n"
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
+    "sub        %1,%2                          \n"
+
     LABELALIGN
-  "1:                                            \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
-    "lea        " MEMLEA(0x40,0) ",%0            \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
-    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
-    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
-    "lea        " MEMLEA(0x20,1) ",%1            \n"
-    "sub        $0x20,%3                         \n"
-    "jg         1b                               \n"
-    "vzeroupper                                  \n"
+    "1:                                        \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "lea        " MEMLEA(0x40,0) ",%0          \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm2             \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm3             \n"
+    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpackuswb  %%ymm3,%%ymm2,%%ymm2           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)           //  vmovdqu %%ymm2,(%1,%2)
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "sub        $0x20,%3                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
   : "+r"(src_uv),     // %0
     "+r"(dst_u),      // %1
     "+r"(dst_v),      // %2
@@ -2699,30 +2650,33 @@
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_SSE2(const uint8* src_uv,
+                     uint8* dst_u,
+                     uint8* dst_v,
                      int width) {
   asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                    \n"
-    "psrlw      $0x8,%%xmm5                      \n"
-    "sub        %1,%2                            \n"
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "psrlw      $0x8,%%xmm5                    \n"
+    "sub        %1,%2                          \n"
+
     LABELALIGN
-  "1:                                            \n"
-    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
-    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
-    "lea        " MEMLEA(0x20,0) ",%0            \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "pand       %%xmm5,%%xmm0                    \n"
-    "pand       %%xmm5,%%xmm1                    \n"
-    "packuswb   %%xmm1,%%xmm0                    \n"
-    "psrlw      $0x8,%%xmm2                      \n"
-    "psrlw      $0x8,%%xmm3                      \n"
-    "packuswb   %%xmm3,%%xmm2                    \n"
-    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
-    "lea        " MEMLEA(0x10,1) ",%1            \n"
-    "sub        $0x10,%3                         \n"
-    "jg         1b                               \n"
+    "1:                                        \n"
+    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
+    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "movdqa     %%xmm0,%%xmm2                  \n"
+    "movdqa     %%xmm1,%%xmm3                  \n"
+    "pand       %%xmm5,%%xmm0                  \n"
+    "pand       %%xmm5,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"
+    "psrlw      $0x8,%%xmm2                    \n"
+    "psrlw      $0x8,%%xmm3                    \n"
+    "packuswb   %%xmm3,%%xmm2                  \n"
+    "movdqu     %%xmm0," MEMACCESS(1) "        \n"
+    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)           //  movdqu     %%xmm2,(%1,%2)
+    "lea        " MEMLEA(0x10,1) ",%1          \n"
+    "sub        $0x10,%3                       \n"
+    "jg         1b                             \n"
   : "+r"(src_uv),     // %0
     "+r"(dst_u),      // %1
     "+r"(dst_v),      // %2
@@ -2735,25 +2689,28 @@
 #endif  // HAS_SPLITUVROW_SSE2
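+
+// A minimal scalar sketch of the SplitUVRow operation above: deinterleave a
+// packed UV row into separate U and V planes.  The _Sketch name is
+// illustrative; the equivalent reference is SplitUVRow_C in row_common.cc.
+static __inline void SplitUVRow_Sketch(const uint8* src_uv,
+                                       uint8* dst_u,
+                                       uint8* dst_v,
+                                       int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_u[x] = src_uv[0];  // even byte goes to the U plane
+    dst_v[x] = src_uv[1];  // odd byte goes to the V plane
+    src_uv += 2;
+  }
+}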
 
 #ifdef HAS_MERGEUVROW_AVX2
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_AVX2(const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uv,
                      int width) {
   asm volatile (
-    "sub       %0,%1                             \n"
+    "sub       %0,%1                           \n"
+
     LABELALIGN
-  "1:                                            \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
-    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
-    "lea       " MEMLEA(0x20,0) ",%0             \n"
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
-    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
+    "1:                                        \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)           //  vmovdqu (%0,%1,1),%%ymm1
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2           \n"
+    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0           \n"
+    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
     "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
     "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
     "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
-    "lea       " MEMLEA(0x40,2) ",%2             \n"
-    "sub       $0x20,%3                          \n"
-    "jg        1b                                \n"
-    "vzeroupper                                  \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
   : "+r"(src_u),     // %0
     "+r"(src_v),     // %1
     "+r"(dst_uv),    // %2
@@ -2766,23 +2723,26 @@
 #endif  // HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_SSE2(const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uv,
                      int width) {
   asm volatile (
-    "sub       %0,%1                             \n"
+    "sub       %0,%1                           \n"
+
     LABELALIGN
-  "1:                                            \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm2                     \n"
-    "punpcklbw %%xmm1,%%xmm0                     \n"
-    "punpckhbw %%xmm1,%%xmm2                     \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
-    "lea       " MEMLEA(0x20,2) ",%2             \n"
-    "sub       $0x10,%3                          \n"
-    "jg        1b                                \n"
+    "1:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpckhbw %%xmm1,%%xmm2                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
+    "lea       " MEMLEA(0x20,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
   : "+r"(src_u),     // %0
     "+r"(src_v),     // %1
     "+r"(dst_uv),    // %2
@@ -2801,8 +2761,9 @@
     "jne        2f                             \n"
     "test       $0xf,%1                        \n"
     "jne        2f                             \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -2812,6 +2773,7 @@
     "sub       $0x20,%2                        \n"
     "jg        1b                              \n"
     "jmp       9f                              \n"
+
     LABELALIGN
   "2:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
@@ -2837,7 +2799,7 @@
 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
   asm volatile (
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
@@ -2860,14 +2822,12 @@
 // Multiple of 1.
 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep movsb " MEMMOVESTRING(0,1) "          \n"
-  : "+S"(src),  // %0
-    "+D"(dst),  // %1
-    "+c"(width_tmp) // %2
-  :
-  : "memory", "cc"
-  );
+  asm volatile("rep movsb " MEMMOVESTRING(0, 1) "          \n"
+               : "+S"(src),       // %0
+                 "+D"(dst),       // %1
+                 "+c"(width_tmp)  // %2
+               :
+               : "memory", "cc");
 }
 #endif  // HAS_COPYROW_ERMS
 
@@ -2879,8 +2839,9 @@
     "pslld     $0x18,%%xmm0                    \n"
     "pcmpeqb   %%xmm1,%%xmm1                   \n"
     "psrld     $0x8,%%xmm1                     \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -2913,8 +2874,9 @@
   asm volatile (
     "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
     "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
@@ -2939,9 +2901,9 @@
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 // width in pixels
 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
+  asm volatile (
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
     "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
     "lea       " MEMLEA(0x20, 0) ", %0         \n"
@@ -2963,6 +2925,47 @@
 }
 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
 
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {
+    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
+    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
+
+void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
+  asm volatile (
+    "vmovdqa    %3,%%ymm4                      \n"
+    "vbroadcastf128 %4,%%ymm5                  \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu   " MEMACCESS(0) ", %%ymm0        \n"
+    "vmovdqu   " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
+    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n" // vpsrld $0x18, %%ymm0
+    "vpshufb    %%ymm5,%%ymm1,%%ymm1           \n"
+    "vmovdqu   " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
+    "vmovdqu   " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
+    "lea       " MEMLEA(0x80, 0) ", %0         \n"
+    "vpackssdw  %%ymm1, %%ymm0, %%ymm0         \n"  // mutates
+    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // mutates
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+    "vpermd     %%ymm0,%%ymm4,%%ymm0           \n"  // unmutate.
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub        $0x20, %2                      \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_a),     // %1
+    "+rm"(width)     // %2
+  : "m"(kPermdARGBToY_AVX),  // %3
+    "m"(kShuffleAlphaShort_AVX2)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
+
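+// A minimal scalar sketch of the alpha-extract rows above: libyuv ARGB is
+// stored in memory as B,G,R,A, so the alpha of pixel x is byte x*4+3 (the
+// psrld $0x18 / vpshufb paths extract the same byte 16 or 32 at a time).
+// The function name here is illustrative.
+static __inline void ARGBExtractAlphaRow_Sketch(const uint8* src_argb,
+                                                uint8* dst_a,
+                                                int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_a[x] = src_argb[x * 4 + 3];  // copy the A byte of each ARGB pixel
+  }
+}
+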
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
@@ -2971,8 +2974,9 @@
     "pslld     $0x18,%%xmm0                    \n"
     "pcmpeqb   %%xmm1,%%xmm1                   \n"
     "psrld     $0x8,%%xmm1                     \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movq      " MEMACCESS(0) ",%%xmm2         \n"
     "lea       " MEMLEA(0x8,0) ",%0            \n"
     "punpcklbw %%xmm2,%%xmm2                   \n"
@@ -3007,8 +3011,9 @@
   asm volatile (
     "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
     "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
     "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
@@ -3036,32 +3041,29 @@
 void SetRow_X86(uint8* dst, uint8 v8, int width) {
   size_t width_tmp = (size_t)(width >> 2);
   const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
+  asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
+               : "+D"(dst),       // %0
+                 "+c"(width_tmp)  // %1
+               : "a"(v32)         // %2
+               : "memory", "cc");
 }
 
 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosb " MEMSTORESTRING(al,0) "        \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v8)          // %2
-    : "memory", "cc");
+  asm volatile("rep stosb " MEMSTORESTRING(al, 0) "        \n"
+               : "+D"(dst),       // %0
+                 "+c"(width_tmp)  // %1
+               : "a"(v8)          // %2
+               : "memory", "cc");
 }
 
 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst_argb),  // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
+  asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
+               : "+D"(dst_argb),  // %0
+                 "+c"(width_tmp)  // %1
+               : "a"(v32)         // %2
+               : "memory", "cc");
 }
 #endif  // HAS_SETROW_X86
 
@@ -3070,8 +3072,9 @@
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -3091,14 +3094,18 @@
   );
 }
 
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+                      int stride_yuy2,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
@@ -3130,13 +3137,16 @@
 }
 
 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
+                         uint8* dst_u,
+                         uint8* dst_v,
+                         int width) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -3166,7 +3176,7 @@
 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
   asm volatile (
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -3186,14 +3196,18 @@
   );
 }
 
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+                      int stride_uyvy,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
@@ -3225,13 +3239,16 @@
 }
 
 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
+                         uint8* dst_u,
+                         uint8* dst_v,
+                         int width) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -3264,8 +3281,9 @@
   asm volatile (
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
@@ -3287,14 +3305,18 @@
   );
 }
 
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+                      int stride_yuy2,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
@@ -3327,13 +3349,16 @@
 }
 
 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
+                         uint8* dst_u,
+                         uint8* dst_v,
+                         int width) {
   asm volatile (
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
@@ -3366,7 +3391,7 @@
 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
   asm volatile (
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
@@ -3387,15 +3412,18 @@
     , "xmm0", "xmm1", "xmm5"
   );
 }
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+                      int stride_uyvy,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
     "sub       %1,%2                           \n"
 
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
@@ -3428,13 +3456,16 @@
 }
 
 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
+                         uint8* dst_u,
+                         uint8* dst_v,
+                         int width) {
   asm volatile (
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
     "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
     "sub       %1,%2                           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
@@ -3467,14 +3498,14 @@
 
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
-static uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
+static uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
+                              11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
 
 // Blend 8 pixels at a time
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
+void ARGBBlendRow_SSSE3(const uint8* src_argb0,
+                        const uint8* src_argb1,
+                        uint8* dst_argb,
+                        int width) {
   asm volatile (
     "pcmpeqb   %%xmm7,%%xmm7                   \n"
     "psrlw     $0xf,%%xmm7                     \n"
@@ -3559,46 +3590,49 @@
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "psllw      $0x8,%%xmm5                    \n"
-    "mov        $0x80808080,%%eax              \n"
-    "movd       %%eax,%%xmm6                   \n"
-    "pshufd     $0x0,%%xmm6,%%xmm6             \n"
-    "mov        $0x807f807f,%%eax              \n"
-    "movd       %%eax,%%xmm7                   \n"
-    "pshufd     $0x0,%%xmm7,%%xmm7             \n"
-    "sub        %2,%0                          \n"
-    "sub        %2,%1                          \n"
-    "sub        %2,%3                          \n"
+void BlendPlaneRow_SSSE3(const uint8* src0,
+                         const uint8* src1,
+                         const uint8* alpha,
+                         uint8* dst,
+                         int width) {
+  asm volatile(
+      "pcmpeqb    %%xmm5,%%xmm5                  \n"
+      "psllw      $0x8,%%xmm5                    \n"
+      "mov        $0x80808080,%%eax              \n"
+      "movd       %%eax,%%xmm6                   \n"
+      "pshufd     $0x0,%%xmm6,%%xmm6             \n"
+      "mov        $0x807f807f,%%eax              \n"
+      "movd       %%eax,%%xmm7                   \n"
+      "pshufd     $0x0,%%xmm7,%%xmm7             \n"
+      "sub        %2,%0                          \n"
+      "sub        %2,%1                          \n"
+      "sub        %2,%3                          \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq       (%2),%%xmm0                    \n"
-    "punpcklbw  %%xmm0,%%xmm0                  \n"
-    "pxor       %%xmm5,%%xmm0                  \n"
-    "movq       (%0,%2,1),%%xmm1               \n"
-    "movq       (%1,%2,1),%%xmm2               \n"
-    "punpcklbw  %%xmm2,%%xmm1                  \n"
-    "psubb      %%xmm6,%%xmm1                  \n"
-    "pmaddubsw  %%xmm1,%%xmm0                  \n"
-    "paddw      %%xmm7,%%xmm0                  \n"
-    "psrlw      $0x8,%%xmm0                    \n"
-    "packuswb   %%xmm0,%%xmm0                  \n"
-    "movq       %%xmm0,(%3,%2,1)               \n"
-    "lea        0x8(%2),%2                     \n"
-    "sub        $0x8,%4                        \n"
-    "jg        1b                              \n"
-  : "+r"(src0),       // %0
-    "+r"(src1),       // %1
-    "+r"(alpha),      // %2
-    "+r"(dst),        // %3
-    "+rm"(width)      // %4
-  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq       (%2),%%xmm0                    \n"
+      "punpcklbw  %%xmm0,%%xmm0                  \n"
+      "pxor       %%xmm5,%%xmm0                  \n"
+      "movq       (%0,%2,1),%%xmm1               \n"
+      "movq       (%1,%2,1),%%xmm2               \n"
+      "punpcklbw  %%xmm2,%%xmm1                  \n"
+      "psubb      %%xmm6,%%xmm1                  \n"
+      "pmaddubsw  %%xmm1,%%xmm0                  \n"
+      "paddw      %%xmm7,%%xmm0                  \n"
+      "psrlw      $0x8,%%xmm0                    \n"
+      "packuswb   %%xmm0,%%xmm0                  \n"
+      "movq       %%xmm0,(%3,%2,1)               \n"
+      "lea        0x8(%2),%2                     \n"
+      "sub        $0x8,%4                        \n"
+      "jg        1b                              \n"
+      : "+r"(src0),   // %0
+        "+r"(src1),   // %1
+        "+r"(alpha),  // %2
+        "+r"(dst),    // %3
+        "+rm"(width)  // %4
+        ::"memory",
+        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
 }
 #endif  // HAS_BLENDPLANEROW_SSSE3
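
For reference, here is a scalar sketch of the unsigned form of the blend math quoted in the comments above; the SIMD paths use the signed rearrangement so the products fit pmaddubsw. The helper names below are illustrative, not libyuv APIs.

// Scalar form of the blend described above:
//   result = (src0 * alpha + src1 * (255 - alpha) + 255) / 256
static unsigned char BlendPixel(unsigned char s0, unsigned char s1,
                                unsigned char a) {
  return (unsigned char)((s0 * a + s1 * (255 - a) + 255) >> 8);
}

static void BlendPlaneRow_Ref(const unsigned char* src0,
                              const unsigned char* src1,
                              const unsigned char* alpha,
                              unsigned char* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {  // One alpha byte per output pixel.
    dst[x] = BlendPixel(src0[x], src1[x], alpha[x]);
  }
}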
 
@@ -3608,67 +3642,67 @@
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                        const uint8* alpha, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
-    "mov        $0x80808080,%%eax              \n"
-    "vmovd      %%eax,%%xmm6                   \n"
-    "vbroadcastss %%xmm6,%%ymm6                \n"
-    "mov        $0x807f807f,%%eax              \n"
-    "vmovd      %%eax,%%xmm7                   \n"
-    "vbroadcastss %%xmm7,%%ymm7                \n"
-    "sub        %2,%0                          \n"
-    "sub        %2,%1                          \n"
-    "sub        %2,%3                          \n"
+void BlendPlaneRow_AVX2(const uint8* src0,
+                        const uint8* src1,
+                        const uint8* alpha,
+                        uint8* dst,
+                        int width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
+      "mov        $0x80808080,%%eax              \n"
+      "vmovd      %%eax,%%xmm6                   \n"
+      "vbroadcastss %%xmm6,%%ymm6                \n"
+      "mov        $0x807f807f,%%eax              \n"
+      "vmovd      %%eax,%%xmm7                   \n"
+      "vbroadcastss %%xmm7,%%ymm7                \n"
+      "sub        %2,%0                          \n"
+      "sub        %2,%1                          \n"
+      "sub        %2,%3                          \n"
 
-    // 32 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    (%2),%%ymm0                    \n"
-    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vmovdqu    (%0,%2,1),%%ymm1               \n"
-    "vmovdqu    (%1,%2,1),%%ymm2               \n"
-    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
-    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
-    "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
-    "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
-    "vmovdqu    %%ymm0,(%3,%2,1)               \n"
-    "lea        0x20(%2),%2                    \n"
-    "sub        $0x20,%4                       \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src0),       // %0
-    "+r"(src1),       // %1
-    "+r"(alpha),      // %2
-    "+r"(dst),        // %3
-    "+rm"(width)      // %4
-  :: "memory", "cc", "eax",
-     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 32 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%2),%%ymm0                    \n"
+      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vmovdqu    (%0,%2,1),%%ymm1               \n"
+      "vmovdqu    (%1,%2,1),%%ymm2               \n"
+      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
+      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
+      "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
+      "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,(%3,%2,1)               \n"
+      "lea        0x20(%2),%2                    \n"
+      "sub        $0x20,%4                       \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src0),   // %0
+        "+r"(src1),   // %1
+        "+r"(alpha),  // %2
+        "+r"(dst),    // %3
+        "+rm"(width)  // %4
+        ::"memory",
+        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_BLENDPLANEROW_AVX2
 
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha
-static uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
-};
-static uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
-};
+static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
+                               7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
+static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+                               15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
 // Attenuate 4 pixels at a time.
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
@@ -3679,7 +3713,7 @@
 
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
@@ -3714,9 +3748,9 @@
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
+static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
+                                         128u, 128u, 14u,  15u, 14u, 15u,
+                                         14u,  15u,  128u, 128u};
 // Attenuate 8 pixels at a time.
 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
@@ -3727,7 +3761,7 @@
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
@@ -3757,13 +3791,14 @@
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
+                             uint8* dst_argb,
                              int width) {
   uintptr_t alpha;
   asm volatile (
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
@@ -3804,10 +3839,10 @@
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
 // Unattenuate 8 pixels at a time.
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
+                             uint8* dst_argb,
                              int width) {
   uintptr_t alpha;
   asm volatile (
@@ -3816,7 +3851,7 @@
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     // replace VPGATHER
     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
@@ -3879,7 +3914,7 @@
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "pmaddubsw %%xmm4,%%xmm0                   \n"
@@ -3922,17 +3957,14 @@
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 // Constant for ARGB color to sepia tone
-static vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+                             17, 68, 35, 0, 17, 68, 35, 0};
 
-static vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+                             22, 88, 45, 0, 22, 88, 45, 0};
 
-static vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+                             24, 98, 50, 0, 24, 98, 50, 0};
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
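
A scalar reading of the sepia coefficients above (channel order B, G, R, A as stored in memory, alpha untouched); this is a sketch only, and the SIMD path's 16-bit intermediates can clip slightly differently on very bright pixels. SepiaPixel is an illustrative name.

static void SepiaPixel(unsigned char* p) {
  int b = p[0], g = p[1], r = p[2];
  int nb = (b * 17 + g * 68 + r * 35) >> 7;  // kARGBToSepiaB
  int ng = (b * 22 + g * 88 + r * 45) >> 7;  // kARGBToSepiaG
  int nr = (b * 24 + g * 98 + r * 50) >> 7;  // kARGBToSepiaR
  p[0] = (unsigned char)(nb > 255 ? 255 : nb);
  p[1] = (unsigned char)(ng > 255 ? 255 : ng);
  p[2] = (unsigned char)(nr > 255 ? 255 : nr);
}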
@@ -3943,7 +3975,7 @@
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
     "pmaddubsw %%xmm2,%%xmm0                   \n"
@@ -3995,8 +4027,10 @@
 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
 // Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
+                              uint8* dst_argb,
+                              const int8* matrix_argb,
+                              int width) {
   asm volatile (
     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
     "pshufd    $0x00,%%xmm5,%%xmm2             \n"
@@ -4006,7 +4040,7 @@
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
     "pmaddubsw %%xmm2,%%xmm0                   \n"
@@ -4058,8 +4092,11 @@
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
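
As a hedged scalar reading of the parameters (helper name illustrative): scale is a 16-bit fixed-point fraction, so (v * scale) >> 16 picks a bucket, which is then mapped back onto steps of interval_size starting at interval_offset; alpha is left alone.

static void QuantizePixel(unsigned char* p, int scale, int interval_size,
                          int interval_offset) {
  int c;
  for (c = 0; c < 3; ++c) {  // B, G, R only; alpha untouched.
    p[c] = (unsigned char)(((p[c] * scale) >> 16) * interval_size +
                           interval_offset);
  }
}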
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
+void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
   asm volatile (
     "movd      %2,%%xmm2                       \n"
     "movd      %3,%%xmm3                       \n"
@@ -4076,7 +4113,7 @@
 
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "punpcklbw %%xmm5,%%xmm0                   \n"
     "pmulhuw   %%xmm2,%%xmm0                   \n"
@@ -4108,7 +4145,9 @@
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_SSE2(const uint8* src_argb,
+                       uint8* dst_argb,
+                       int width,
                        uint32 value) {
   asm volatile (
     "movd      %3,%%xmm2                       \n"
@@ -4117,7 +4156,7 @@
 
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -4144,14 +4183,16 @@
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
+                          const uint8* src_argb1,
+                          uint8* dst_argb,
+                          int width) {
   asm volatile (
-    "pxor      %%xmm5,%%xmm5                  \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
 
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
@@ -4182,14 +4223,16 @@
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
+                          const uint8* src_argb1,
+                          uint8* dst_argb,
+                          int width) {
   asm volatile (
     "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
 
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
     "lea        " MEMLEA(0x20,0) ",%0          \n"
     "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
@@ -4221,12 +4264,14 @@
 
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+void ARGBAddRow_SSE2(const uint8* src_argb0,
+                     const uint8* src_argb1,
+                     uint8* dst_argb,
+                     int width) {
   asm volatile (
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
@@ -4249,12 +4294,14 @@
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+void ARGBAddRow_AVX2(const uint8* src_argb0,
+                     const uint8* src_argb1,
+                     uint8* dst_argb,
+                     int width) {
   asm volatile (
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
     "lea        " MEMLEA(0x20,0) ",%0          \n"
     "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
@@ -4277,12 +4324,14 @@
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+void ARGBSubtractRow_SSE2(const uint8* src_argb0,
+                          const uint8* src_argb1,
+                          uint8* dst_argb,
+                          int width) {
   asm volatile (
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
@@ -4305,12 +4354,14 @@
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+void ARGBSubtractRow_AVX2(const uint8* src_argb0,
+                          const uint8* src_argb1,
+                          uint8* dst_argb,
+                          int width) {
   asm volatile (
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
     "lea        " MEMLEA(0x20,0) ",%0          \n"
     "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
@@ -4318,7 +4369,7 @@
     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
     "lea        " MEMLEA(0x20,2) ",%2          \n"
     "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
+    "jg         1b                             \n"
     "vzeroupper                                \n"
   : "+r"(src_argb0),  // %0
     "+r"(src_argb1),  // %1
@@ -4336,8 +4387,11 @@
 // -1  0  1
 // -2  0  2
 // -1  0  1
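
In scalar terms, with the three source rows passed in separately, each output is the absolute, clamped response of the taps above (the kernel's sign is irrelevant once the absolute value is taken). Sketch only; SobelXPixel is an illustrative name.

static unsigned char SobelXPixel(const unsigned char* y0,
                                 const unsigned char* y1,
                                 const unsigned char* y2, int x) {
  int sobel = (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) +
              (y2[x] - y2[x + 2]);
  if (sobel < 0) sobel = -sobel;
  return (unsigned char)(sobel > 255 ? 255 : sobel);
}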
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+void SobelXRow_SSE2(const uint8* src_y0,
+                    const uint8* src_y1,
+                    const uint8* src_y2,
+                    uint8* dst_sobelx,
+                    int width) {
   asm volatile (
     "sub       %0,%1                           \n"
     "sub       %0,%2                           \n"
@@ -4346,7 +4400,7 @@
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movq      " MEMACCESS(0) ",%%xmm0         \n"
     "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
     "punpcklbw %%xmm5,%%xmm0                   \n"
@@ -4390,8 +4444,10 @@
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
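
The vertical variant is the same idea on two rows, since the kernel's middle row is all zeros; a matching sketch with an illustrative name:

static unsigned char SobelYPixel(const unsigned char* y0,
                                 const unsigned char* y1, int x) {
  int sobel = (y0[x] - y1[x]) + 2 * (y0[x + 1] - y1[x + 1]) +
              (y0[x + 2] - y1[x + 2]);
  if (sobel < 0) sobel = -sobel;
  return (unsigned char)(sobel > 255 ? 255 : sobel);
}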
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
+void SobelYRow_SSE2(const uint8* src_y0,
+                    const uint8* src_y1,
+                    uint8* dst_sobely,
+                    int width) {
   asm volatile (
     "sub       %0,%1                           \n"
     "sub       %0,%2                           \n"
@@ -4399,7 +4455,7 @@
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movq      " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
     "punpcklbw %%xmm5,%%xmm0                   \n"
@@ -4443,8 +4499,10 @@
 // R = Sobel
 // G = Sobel
 // B = Sobel
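
The packing itself is straightforward: the X and Y magnitudes are added with saturation and replicated into B, G and R, with alpha forced to 0xff (SobelXYRow further down keeps X, the sum and Y in separate channels instead). A sketch, with an illustrative helper name:

static void SobelToARGBPixel(unsigned char sobelx, unsigned char sobely,
                             unsigned char* dst_argb) {
  int s = sobelx + sobely;
  if (s > 255) s = 255;            // paddusb saturates the same way.
  dst_argb[0] = (unsigned char)s;  // B
  dst_argb[1] = (unsigned char)s;  // G
  dst_argb[2] = (unsigned char)s;  // R
  dst_argb[3] = 255u;              // A
}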
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
+void SobelRow_SSE2(const uint8* src_sobelx,
+                   const uint8* src_sobely,
+                   uint8* dst_argb,
+                   int width) {
   asm volatile (
     "sub       %0,%1                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
@@ -4452,7 +4510,7 @@
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
     "lea       " MEMLEA(0x10,0) ",%0           \n"
@@ -4490,8 +4548,10 @@
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
+                          const uint8* src_sobely,
+                          uint8* dst_y,
+                          int width) {
   asm volatile (
     "sub       %0,%1                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
@@ -4499,7 +4559,7 @@
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
     "lea       " MEMLEA(0x10,0) ",%0           \n"
@@ -4525,15 +4585,17 @@
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
+void SobelXYRow_SSE2(const uint8* src_sobelx,
+                     const uint8* src_sobely,
+                     uint8* dst_argb,
+                     int width) {
   asm volatile (
     "sub       %0,%1                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
 
     // 8 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
     "lea       " MEMLEA(0x10,0) ",%0           \n"
@@ -4572,8 +4634,10 @@
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value, inclusive of the value.
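
In scalar form: keep a running per-channel sum across the row and add the cumulative value from the row directly above, which is what makes the later box-filter averaging a four-corner lookup. A sketch with illustrative naming (libyuv's int32 is spelled plain int here):

static void CumulativeSumRow_Ref(const unsigned char* row, int* cumsum,
                                 const int* previous_cumsum, int width) {
  int sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}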
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
+void ComputeCumulativeSumRow_SSE2(const uint8* row,
+                                  int32* cumsum,
+                                  const int32* previous_cumsum,
+                                  int width) {
   asm volatile (
     "pxor      %%xmm0,%%xmm0                   \n"
     "pxor      %%xmm1,%%xmm1                   \n"
@@ -4582,9 +4646,9 @@
     "test      $0xf,%1                         \n"
     "jne       49f                             \n"
 
-  // 4 pixel loop                              \n"
+    // 4 pixel loop.
     LABELALIGN
-  "40:                                         \n"
+    "40:                                       \n"
     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
     "lea       " MEMLEA(0x10,0) ",%0           \n"
     "movdqa    %%xmm2,%%xmm4                   \n"
@@ -4617,13 +4681,13 @@
     "sub       $0x4,%3                         \n"
     "jge       40b                             \n"
 
-  "49:                                         \n"
+    "49:                                       \n"
     "add       $0x3,%3                         \n"
     "jl        19f                             \n"
 
-  // 1 pixel loop                              \n"
+    // 1 pixel loop.
     LABELALIGN
-  "10:                                         \n"
+    "10:                                       \n"
     "movd      " MEMACCESS(0) ",%%xmm2         \n"
     "lea       " MEMLEA(0x4,0) ",%0            \n"
     "punpcklbw %%xmm1,%%xmm2                   \n"
@@ -4637,7 +4701,7 @@
     "sub       $0x1,%3                         \n"
     "jge       10b                             \n"
 
-  "19:                                         \n"
+    "19:                                       \n"
   : "+r"(row),  // %0
     "+r"(cumsum),  // %1
     "+r"(previous_cumsum),  // %2
@@ -4650,8 +4714,11 @@
 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
 
 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
+void CumulativeSumToAverageRow_SSE2(const int32* topleft,
+                                    const int32* botleft,
+                                    int width,
+                                    int area,
+                                    uint8* dst,
                                     int count) {
   asm volatile (
     "movd      %5,%%xmm5                       \n"
@@ -4672,7 +4739,7 @@
     "cvtps2dq  %%xmm5,%%xmm5                   \n"
     "packssdw  %%xmm5,%%xmm5                   \n"
 
-  // 4 pixel small loop                        \n"
+    // 4 pixel small loop.
     LABELALIGN
   "4:                                         \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
@@ -4783,8 +4850,11 @@
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* src_dudv, int width) {
+void ARGBAffineRow_SSE2(const uint8* src_argb,
+                        int src_argb_stride,
+                        uint8* dst_argb,
+                        const float* src_dudv,
+                        int width) {
   intptr_t src_argb_stride_temp = src_argb_stride;
   intptr_t temp;
   asm volatile (
@@ -4868,8 +4938,10 @@
 
 #ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
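
source_y_fraction is the weight, out of 256, given to the second row (src_ptr + src_stride); fractions of 0 and 128 take the copy and pavgb fast paths in the assembly. A scalar sketch of the general blend, with the exact rounding of the SIMD path left aside and an illustrative name:

static void InterpolateRow_Ref(unsigned char* dst, const unsigned char* src,
                               int src_stride,  // ptrdiff_t in the real API.
                               int width, int source_y_fraction) {
  const unsigned char* src1 = src + src_stride;
  int f1 = source_y_fraction;
  int f0 = 256 - f1;
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (unsigned char)((src[x] * f0 + src1[x] * f1 + 128) >> 8);
  }
}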
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_SSSE3(uint8* dst_ptr,
+                          const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
                           int source_y_fraction) {
   asm volatile (
     "sub       %1,%0                           \n"
@@ -4891,7 +4963,7 @@
 
     // General purpose row blend.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
     MEMOPREG(movdqu,0x00,1,4,1,xmm2)
     "movdqa     %%xmm0,%%xmm1                  \n"
@@ -4949,8 +5021,10 @@
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_AVX2(uint8* dst_ptr,
+                         const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction) {
   asm volatile (
     "cmp       $0x0,%3                         \n"
@@ -4972,7 +5046,7 @@
 
     // General purpose row blend.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
     MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
     "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
@@ -5025,12 +5099,14 @@
 
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
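
The shuffler is a pshufb-style byte-index table; read per pixel, its first four entries say where each output byte comes from within the 4-byte source pixel (the SIMD tables carry 16 entries so four pixels are shuffled at once). A hedged sketch of that reading, with an illustrative name:

static void ShufflePixel(const unsigned char* src_argb,
                         unsigned char* dst_argb,
                         const unsigned char* shuffler) {
  dst_argb[0] = src_argb[shuffler[0]];
  dst_argb[1] = src_argb[shuffler[1]];
  dst_argb[2] = src_argb[shuffler[2]];
  dst_argb[3] = src_argb[shuffler[3]];
}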
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width) {
+void ARGBShuffleRow_SSSE3(const uint8* src_argb,
+                          uint8* dst_argb,
+                          const uint8* shuffler,
+                          int width) {
   asm volatile (
     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
@@ -5053,12 +5129,14 @@
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
+void ARGBShuffleRow_AVX2(const uint8* src_argb,
+                         uint8* dst_argb,
+                         const uint8* shuffler,
+                         int width) {
   asm volatile (
     "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
     "lea       " MEMLEA(0x40,0) ",%0           \n"
@@ -5082,8 +5160,10 @@
 
 #ifdef HAS_ARGBSHUFFLEROW_SSE2
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
+void ARGBShuffleRow_SSE2(const uint8* src_argb,
+                         uint8* dst_argb,
+                         const uint8* shuffler,
+                         int width) {
   uintptr_t pixel_temp;
   asm volatile (
     "pxor      %%xmm5,%%xmm5                   \n"
@@ -5098,7 +5178,7 @@
     "je        2103f                           \n"
 
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movzb     " MEMACCESS(4) ",%2             \n"
     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
     "mov       %b2," MEMACCESS(1) "            \n"
@@ -5204,11 +5284,12 @@
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
+                        uint8* dst_frame,
+                        int width) {
+  asm volatile (
     "sub       %1,%2                             \n"
     LABELALIGN
-  "1:                                            \n"
+    "1:                                        \n"
     "movq      " MEMACCESS(1) ",%%xmm2           \n"
     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
     "lea       " MEMLEA(0x8,1) ",%1              \n"
@@ -5239,11 +5320,12 @@
 void I422ToUYVYRow_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
+                        uint8* dst_frame,
+                        int width) {
+  asm volatile (
     "sub        %1,%2                            \n"
     LABELALIGN
-  "1:                                            \n"
+    "1:                                        \n"
     "movq      " MEMACCESS(1) ",%%xmm2           \n"
     MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
     "lea       " MEMLEA(0x8,1) ",%1              \n"
@@ -5272,14 +5354,15 @@
 
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+                            uint8* dst_argb,
+                            const float* poly,
                             int width) {
   asm volatile (
     "pxor      %%xmm3,%%xmm3                   \n"
 
     // 2 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movq      " MEMACCESS(0) ",%%xmm0         \n"
     "lea       " MEMLEA(0x8,0) ",%0            \n"
     "punpcklbw %%xmm3,%%xmm0                   \n"
@@ -5328,7 +5411,8 @@
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+                            uint8* dst_argb,
+                            const float* poly,
                             int width) {
   asm volatile (
     "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
@@ -5338,7 +5422,7 @@
 
     // 2 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
     "lea         " MEMLEA(0x8,0) ",%0          \n"
     "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
@@ -5366,15 +5450,150 @@
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kScaleBias = 1.9259299444e-34f;
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+    "pshufd      $0x0,%3,%%xmm4                \n"
+    "pxor        %%xmm5,%%xmm5                 \n"
+    "sub         %0,%1                         \n"
+
+    // 16 pixel loop.
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu      " MEMACCESS(0) ",%%xmm2       \n"  // 8 shorts
+    "add         $0x10,%0                      \n"
+    "movdqa      %%xmm2,%%xmm3                 \n"
+    "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/1
+    "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
+    "punpckhwd   %%xmm5,%%xmm3                 \n"
+    "cvtdq2ps    %%xmm3,%%xmm3                 \n"
+    "mulps       %%xmm4,%%xmm2                 \n"
+    "mulps       %%xmm4,%%xmm3                 \n"
+    "psrld       $0xd,%%xmm2                   \n"
+    "psrld       $0xd,%%xmm3                   \n"
+    "packssdw    %%xmm3,%%xmm2                 \n"
+    MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
+    "sub         $0x8,%2                       \n"
+    "jg          1b                            \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "x"(scale * kScaleBias)   // %3
+  : "memory", "cc",
+    "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_HALFFLOATROW_SSE2
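
A note on kScaleBias: 1.9259299444e-34 is 2^-112, which is the gap between the float exponent bias (127) and the half-float bias (15). Multiplying by scale * 2^-112 therefore leaves a float whose stored exponent already matches the half-float encoding, and shifting the raw bits right by 13 (23 vs 10 mantissa bits) yields the half-float pattern, which is what the psrld/packssdw pair above does. A scalar sketch of the same trick, truncating like the SSE2 path; the helper name is illustrative:

#include <stdint.h>
#include <string.h>

static uint16_t HalfFromUint16(uint16_t v, float scale) {
  float f = (float)v * scale * 1.9259299444e-34f;  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // Reinterpret the float's bit pattern.
  return (uint16_t)(bits >> 13);    // Drop the extra 13 mantissa bits.
}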
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+    "vbroadcastss  %3, %%ymm4                  \n"
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+    "sub        %0,%1                          \n"
+
+    // 16 pixel loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm2        \n"  // 16 shorts
+    "add        $0x20,%0                       \n"
+    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
+    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
+    "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
+    "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
+    "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
+    "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
+    "vpsrld     $0xd,%%ymm3,%%ymm3             \n"
+    "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
+    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
+    MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
+    "sub        $0x10,%2                       \n"
+    "jg         1b                             \n"
+
+    "vzeroupper                                \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "x"(scale * kScaleBias)   // %3
+  : "memory", "cc",
+    "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+    "vbroadcastss  %3, %%ymm4                  \n"
+    "sub        %0,%1                          \n"
+
+    // 16 pixel loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 16 shorts -> 16 ints
+    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"
+    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
+    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+    "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
+    "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
+    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
+    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
+    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
+    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
+    "add         $0x20,%0                      \n"
+    "sub         $0x10,%2                      \n"
+    "jg          1b                            \n"
+    "vzeroupper                                \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  : "x"(scale)   // %3
+  : "memory", "cc",
+    "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
+  asm volatile (
+    "sub        %0,%1                          \n"
+    // 16 pixel loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 16 shorts -> 16 ints
+    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"
+    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
+    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
+    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
+    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
+    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
+    "add         $0x20,%0                      \n"
+    "sub         $0x10,%2                      \n"
+    "jg          1b                            \n"
+    "vzeroupper                                \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc",
+    "xmm2", "xmm3"
+  );
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
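
A hedged scalar reading of the in-place lookup, assuming the usual libyuv layout of table_argb (256 entries of 4 bytes, one column per channel); the helper name is illustrative:

static void ColorTablePixel(unsigned char* dst_argb,
                            const unsigned char* table_argb) {
  dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
  dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
  dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
  dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
}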
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+void ARGBColorTableRow_X86(uint8* dst_argb,
+                           const uint8* table_argb,
                            int width) {
   uintptr_t pixel_temp;
   asm volatile (
     // 1 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movzb     " MEMACCESS(0) ",%1             \n"
     "lea       " MEMLEA(0x4,0) ",%0            \n"
     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
@@ -5405,7 +5624,7 @@
   asm volatile (
     // 1 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movzb     " MEMACCESS(0) ",%1             \n"
     "lea       " MEMLEA(0x4,0) ",%0            \n"
     MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
@@ -5428,9 +5647,11 @@
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Tranform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+                                 uint8* dst_argb,
                                  int width,
-                                 const uint8* luma, uint32 lumacoeff) {
+                                 const uint8* luma,
+                                 uint32 lumacoeff) {
   uintptr_t pixel_temp;
   uintptr_t table_temp;
   asm volatile (
@@ -5442,7 +5663,7 @@
 
     // 4 pixel loop.
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
     "pmaddubsw %%xmm3,%%xmm0                   \n"
     "phaddw    %%xmm0,%%xmm0                   \n"
diff --git a/files/source/row_mips.cc b/files/source/row_mips.cc
deleted file mode 100644
index 285f0b5..0000000
--- a/files/source/row_mips.cc
+++ /dev/null
@@ -1,782 +0,0 @@
-/*
- *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef HAS_COPYROW_MIPS
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
-  __asm__ __volatile__ (
-    ".set      noreorder                         \n"
-    ".set      noat                              \n"
-    "slti      $at, %[count], 8                  \n"
-    "bne       $at ,$zero, $last8                \n"
-    "xor       $t8, %[src], %[dst]               \n"
-    "andi      $t8, $t8, 0x3                     \n"
-
-    "bne       $t8, $zero, unaligned             \n"
-    "negu      $a3, %[dst]                       \n"
-    // make dst/src aligned
-    "andi      $a3, $a3, 0x3                     \n"
-    "beq       $a3, $zero, $chk16w               \n"
-    // word-aligned now count is the remining bytes count
-    "subu     %[count], %[count], $a3            \n"
-
-    "lwr       $t8, 0(%[src])                    \n"
-    "addu      %[src], %[src], $a3               \n"
-    "swr       $t8, 0(%[dst])                    \n"
-    "addu      %[dst], %[dst], $a3               \n"
-
-    // Now the dst/src are mutually word-aligned with word-aligned addresses
-    "$chk16w:                                    \n"
-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
-    // t8 is the byte count after 64-byte chunks
-    "beq       %[count], $t8, chk8w              \n"
-    // There will be at most 1 32-byte chunk after it
-    "subu      $a3, %[count], $t8                \n"  // the reminder
-    // Here a3 counts bytes in 16w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // Now a3 is the final dst after 64-byte chunks
-    "addu      $t0, %[dst], %[count]             \n"
-    // t0 is the "past the end" address
-
-    // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
-    // the "t0-32" address
-    // This means: for x=128 the last "safe" a1 address is "t0-160"
-    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
-    // we will use "pref 30,128(a1)", so "t0-160" is the limit
-    "subu      $t9, $t0, 160                     \n"
-    // t9 is the "last safe pref 30,128(a1)" address
-    "pref      0, 0(%[src])                      \n"  // first line of src
-    "pref      0, 32(%[src])                     \n"  // second line of src
-    "pref      0, 64(%[src])                     \n"
-    "pref      30, 32(%[dst])                    \n"
-    // In case the a1 > t9 don't use "pref 30" at all
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bgtz      $v1, $loop16w                     \n"
-    "nop                                         \n"
-    // otherwise, start with using pref30
-    "pref      30, 64(%[dst])                    \n"
-    "$loop16w:                                    \n"
-    "pref      0, 96(%[src])                     \n"
-    "lw        $t0, 0(%[src])                    \n"
-    "bgtz      $v1, $skip_pref30_96              \n"  // skip
-    "lw        $t1, 4(%[src])                    \n"
-    "pref      30, 96(%[dst])                    \n"  // continue
-    "$skip_pref30_96:                            \n"
-    "lw        $t2, 8(%[src])                    \n"
-    "lw        $t3, 12(%[src])                   \n"
-    "lw        $t4, 16(%[src])                   \n"
-    "lw        $t5, 20(%[src])                   \n"
-    "lw        $t6, 24(%[src])                   \n"
-    "lw        $t7, 28(%[src])                   \n"
-    "pref      0, 128(%[src])                    \n"
-    //  bring the next lines of src, addr 128
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "lw        $t0, 32(%[src])                   \n"
-    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
-    "lw        $t1, 36(%[src])                   \n"
-    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
-    "$skip_pref30_128:                           \n"
-    "lw        $t2, 40(%[src])                   \n"
-    "lw        $t3, 44(%[src])                   \n"
-    "lw        $t4, 48(%[src])                   \n"
-    "lw        $t5, 52(%[src])                   \n"
-    "lw        $t6, 56(%[src])                   \n"
-    "lw        $t7, 60(%[src])                   \n"
-    "pref      0, 160(%[src])                    \n"
-    // bring the next lines of src, addr 160
-    "sw        $t0, 32(%[dst])                   \n"
-    "sw        $t1, 36(%[dst])                   \n"
-    "sw        $t2, 40(%[dst])                   \n"
-    "sw        $t3, 44(%[dst])                   \n"
-    "sw        $t4, 48(%[dst])                   \n"
-    "sw        $t5, 52(%[dst])                   \n"
-    "sw        $t6, 56(%[dst])                   \n"
-    "sw        $t7, 60(%[dst])                   \n"
-
-    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bne       %[dst], $a3, $loop16w             \n"
-    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
-    "move      %[count], $t8                     \n"
-
-    // Here we have src and dest word-aligned but less than 64-bytes to go
-
-    "chk8w:                                      \n"
-    "pref      0, 0x0(%[src])                    \n"
-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
-    // t8 is the remainder count past 32-byte chunks
-    "beq       %[count], $t8, chk1w              \n"
-    // if count==t8, no 32-byte chunk
-    " nop                                        \n"
-
-    "lw        $t0, 0(%[src])                    \n"
-    "lw        $t1, 4(%[src])                    \n"
-    "lw        $t2, 8(%[src])                    \n"
-    "lw        $t3, 12(%[src])                   \n"
-    "lw        $t4, 16(%[src])                   \n"
-    "lw        $t5, 20(%[src])                   \n"
-    "lw        $t6, 24(%[src])                   \n"
-    "lw        $t7, 28(%[src])                   \n"
-    "addiu     %[src], %[src], 32                \n"
-
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "addiu     %[dst], %[dst], 32                \n"
-
-    "chk1w:                                      \n"
-    "andi      %[count], $t8, 0x3                \n"
-    // now count is the remainder past 1w chunks
-    "beq       %[count], $t8, $last8             \n"
-    " subu     $a3, $t8, %[count]                \n"
-    // a3 is count of bytes in 1w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // now a3 is the dst address past the 1w chunks
-    // copying in words (4-byte chunks)
-    "$wordCopy_loop:                             \n"
-    "lw        $t3, 0(%[src])                    \n"
-    // the first t3 may be equal t0 ... optimize?
-    "addiu     %[src], %[src],4                  \n"
-    "addiu     %[dst], %[dst],4                  \n"
-    "bne       %[dst], $a3,$wordCopy_loop        \n"
-    " sw       $t3, -4(%[dst])                   \n"
-
-    // For the last (<8) bytes
-    "$last8:                                     \n"
-    "blez      %[count], leave                   \n"
-    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
-    "$last8loop:                                 \n"
-    "lb        $v1, 0(%[src])                    \n"
-    "addiu     %[src], %[src], 1                 \n"
-    "addiu     %[dst], %[dst], 1                 \n"
-    "bne       %[dst], $a3, $last8loop           \n"
-    " sb       $v1, -1(%[dst])                   \n"
-
-    "leave:                                      \n"
-    "  j       $ra                               \n"
-    "  nop                                       \n"
-
-    //
-    // UNALIGNED case
-    //
-
-    "unaligned:                                  \n"
-    // got here with a3="negu a1"
-    "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
-    "beqz      $a3, $ua_chk16w                   \n"
-    " subu     %[count], %[count], $a3           \n"
-    // bytes left after initial a3 bytes
-    "lwr       $v1, 0(%[src])                    \n"
-    "lwl       $v1, 3(%[src])                    \n"
-    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
-    "swr       $v1, 0(%[dst])                    \n"
-    "addu      %[dst], %[dst], $a3               \n"
-    // below the dst will be word aligned (NOTE1)
-    "$ua_chk16w:                                 \n"
-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
-    // t8 is the byte count after 64-byte chunks
-    "beq       %[count], $t8, ua_chk8w           \n"
-    // if count==t8, no 64-byte chunks
-    // There will be at most 1 32-byte chunk after it
-    "subu      $a3, %[count], $t8                \n"  // the reminder
-    // Here a3 counts bytes in 16w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // Now a3 is the final dst after 64-byte chunks
-    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
-    "subu      $t9, $t0, 160                     \n"
-    // t9 is the "last safe pref 30,128(a1)" address
-    "pref      0, 0(%[src])                      \n"  // first line of src
-    "pref      0, 32(%[src])                     \n"  // second line  addr 32
-    "pref      0, 64(%[src])                     \n"
-    "pref      30, 32(%[dst])                    \n"
-    // safe, as we have at least 64 bytes ahead
-    // In case the a1 > t9 don't use "pref 30" at all
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bgtz      $v1, $ua_loop16w                  \n"
-    // skip "pref 30,64(a1)" for too short arrays
-    " nop                                        \n"
-    // otherwise, start with using pref30
-    "pref      30, 64(%[dst])                    \n"
-    "$ua_loop16w:                                \n"
-    "pref      0, 96(%[src])                     \n"
-    "lwr       $t0, 0(%[src])                    \n"
-    "lwl       $t0, 3(%[src])                    \n"
-    "lwr       $t1, 4(%[src])                    \n"
-    "bgtz      $v1, $ua_skip_pref30_96           \n"
-    " lwl      $t1, 7(%[src])                    \n"
-    "pref      30, 96(%[dst])                    \n"
-    // continue setting up the dest, addr 96
-    "$ua_skip_pref30_96:                         \n"
-    "lwr       $t2, 8(%[src])                    \n"
-    "lwl       $t2, 11(%[src])                   \n"
-    "lwr       $t3, 12(%[src])                   \n"
-    "lwl       $t3, 15(%[src])                   \n"
-    "lwr       $t4, 16(%[src])                   \n"
-    "lwl       $t4, 19(%[src])                   \n"
-    "lwr       $t5, 20(%[src])                   \n"
-    "lwl       $t5, 23(%[src])                   \n"
-    "lwr       $t6, 24(%[src])                   \n"
-    "lwl       $t6, 27(%[src])                   \n"
-    "lwr       $t7, 28(%[src])                   \n"
-    "lwl       $t7, 31(%[src])                   \n"
-    "pref      0, 128(%[src])                    \n"
-    // bring the next lines of src, addr 128
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "lwr       $t0, 32(%[src])                   \n"
-    "lwl       $t0, 35(%[src])                   \n"
-    "lwr       $t1, 36(%[src])                   \n"
-    "bgtz      $v1, ua_skip_pref30_128           \n"
-    " lwl      $t1, 39(%[src])                   \n"
-    "pref      30, 128(%[dst])                   \n"
-    // continue setting up the dest, addr 128
-    "ua_skip_pref30_128:                         \n"
-
-    "lwr       $t2, 40(%[src])                   \n"
-    "lwl       $t2, 43(%[src])                   \n"
-    "lwr       $t3, 44(%[src])                   \n"
-    "lwl       $t3, 47(%[src])                   \n"
-    "lwr       $t4, 48(%[src])                   \n"
-    "lwl       $t4, 51(%[src])                   \n"
-    "lwr       $t5, 52(%[src])                   \n"
-    "lwl       $t5, 55(%[src])                   \n"
-    "lwr       $t6, 56(%[src])                   \n"
-    "lwl       $t6, 59(%[src])                   \n"
-    "lwr       $t7, 60(%[src])                   \n"
-    "lwl       $t7, 63(%[src])                   \n"
-    "pref      0, 160(%[src])                    \n"
-    // bring the next lines of src, addr 160
-    "sw        $t0, 32(%[dst])                   \n"
-    "sw        $t1, 36(%[dst])                   \n"
-    "sw        $t2, 40(%[dst])                   \n"
-    "sw        $t3, 44(%[dst])                   \n"
-    "sw        $t4, 48(%[dst])                   \n"
-    "sw        $t5, 52(%[dst])                   \n"
-    "sw        $t6, 56(%[dst])                   \n"
-    "sw        $t7, 60(%[dst])                   \n"
-
-    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
-    "sgtu      $v1,%[dst],$t9                    \n"
-    "bne       %[dst],$a3,$ua_loop16w            \n"
-    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
-    "move      %[count],$t8                      \n"
-
-    // Here we have src and dest word-aligned but less than 64-bytes to go
-
-    "ua_chk8w:                                   \n"
-    "pref      0, 0x0(%[src])                    \n"
-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
-    // t8 is the remainder count
-    "beq       %[count], $t8, $ua_chk1w          \n"
-    // when count==t8, no 32-byte chunk
-
-    "lwr       $t0, 0(%[src])                    \n"
-    "lwl       $t0, 3(%[src])                    \n"
-    "lwr       $t1, 4(%[src])                    \n"
-    "lwl       $t1, 7(%[src])                    \n"
-    "lwr       $t2, 8(%[src])                    \n"
-    "lwl       $t2, 11(%[src])                   \n"
-    "lwr       $t3, 12(%[src])                   \n"
-    "lwl       $t3, 15(%[src])                   \n"
-    "lwr       $t4, 16(%[src])                   \n"
-    "lwl       $t4, 19(%[src])                   \n"
-    "lwr       $t5, 20(%[src])                   \n"
-    "lwl       $t5, 23(%[src])                   \n"
-    "lwr       $t6, 24(%[src])                   \n"
-    "lwl       $t6, 27(%[src])                   \n"
-    "lwr       $t7, 28(%[src])                   \n"
-    "lwl       $t7, 31(%[src])                   \n"
-    "addiu     %[src], %[src], 32                \n"
-
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "addiu     %[dst], %[dst], 32                \n"
-
-    "$ua_chk1w:                                  \n"
-    "andi      %[count], $t8, 0x3                \n"
-    // now count is the remainder past 1w chunks
-    "beq       %[count], $t8, ua_smallCopy       \n"
-    "subu      $a3, $t8, %[count]                \n"
-    // a3 is count of bytes in 1w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // now a3 is the dst address past the 1w chunks
-
-    // copying in words (4-byte chunks)
-    "$ua_wordCopy_loop:                          \n"
-    "lwr       $v1, 0(%[src])                    \n"
-    "lwl       $v1, 3(%[src])                    \n"
-    "addiu     %[src], %[src], 4                 \n"
-    "addiu     %[dst], %[dst], 4                 \n"
-    // note: dst=a1 is word aligned here, see NOTE1
-    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
-    " sw       $v1,-4(%[dst])                    \n"
-
-    // Now less than 4 bytes (value in count) left to copy
-    "ua_smallCopy:                               \n"
-    "beqz      %[count], leave                   \n"
-    " addu     $a3, %[dst], %[count]             \n" // a3 = last dst address
-    "$ua_smallCopy_loop:                         \n"
-    "lb        $v1, 0(%[src])                    \n"
-    "addiu     %[src], %[src], 1                 \n"
-    "addiu     %[dst], %[dst], 1                 \n"
-    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
-    " sb       $v1, -1(%[dst])                   \n"
-
-    "j         $ra                               \n"
-    " nop                                        \n"
-    ".set      at                                \n"
-    ".set      reorder                           \n"
-       : [dst] "+r" (dst), [src] "+r" (src)
-       : [count] "r" (count)
-       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
-       "t8", "t9", "a3", "v1", "at"
-  );
-}
-#endif  // HAS_COPYROW_MIPS
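The CopyRow_MIPS assembly deleted above is a staged memcpy: 64-byte chunks with prefetch, at most one 32-byte chunk, then 4-byte words, then single bytes, with a separate lwr/lwl path for unaligned sources. A minimal C sketch of that control flow, for orientation only (the helper name is hypothetical and not a libyuv API):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Hypothetical scalar equivalent of the staged copy above. The real assembly
// unrolls each stage into lw/sw pairs (or lwr/lwl for unaligned sources) and
// issues pref instructions ahead of the loads and stores.
static void CopyRowStaged(uint8_t* dst, const uint8_t* src, size_t count) {
  while (count >= 64) {  // 64-byte chunks (16 word copies per iteration in asm)
    memcpy(dst, src, 64);
    dst += 64;
    src += 64;
    count -= 64;
  }
  if (count >= 32) {  // at most one 32-byte chunk remains
    memcpy(dst, src, 32);
    dst += 32;
    src += 32;
    count -= 32;
  }
  while (count >= 4) {  // remaining whole words (4-byte chunks)
    memcpy(dst, src, 4);
    dst += 4;
    src += 4;
    count -= 4;
  }
  while (count--) {  // final bytes
    *dst++ = *src++;
  }
}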
-
-// DSPR2 functions
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
-    (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                      int width) {
-  __asm__ __volatile__ (
-    ".set push                                     \n"
-    ".set noreorder                                \n"
-    "srl             $t4, %[width], 4              \n"  // multiplies of 16
-    "blez            $t4, 2f                       \n"
-    " andi           %[width], %[width], 0xf       \n"  // residual
-
-  "1:                                              \n"
-    "addiu           $t4, $t4, -1                  \n"
-    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
-    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
-    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
-    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
-    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
-    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
-    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
-    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
-    "addiu           %[src_uv], %[src_uv], 32      \n"
-    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
-    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
-    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
-    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
-    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
-    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
-    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
-    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
-    "sw              $t9, 0(%[dst_v])              \n"
-    "sw              $t0, 0(%[dst_u])              \n"
-    "sw              $t1, 4(%[dst_v])              \n"
-    "sw              $t2, 4(%[dst_u])              \n"
-    "sw              $t3, 8(%[dst_v])              \n"
-    "sw              $t5, 8(%[dst_u])              \n"
-    "sw              $t6, 12(%[dst_v])             \n"
-    "sw              $t7, 12(%[dst_u])             \n"
-    "addiu           %[dst_v], %[dst_v], 16        \n"
-    "bgtz            $t4, 1b                       \n"
-    " addiu          %[dst_u], %[dst_u], 16        \n"
-
-    "beqz            %[width], 3f                  \n"
-    " nop                                          \n"
-
-  "2:                                              \n"
-    "lbu             $t0, 0(%[src_uv])             \n"
-    "lbu             $t1, 1(%[src_uv])             \n"
-    "addiu           %[src_uv], %[src_uv], 2       \n"
-    "addiu           %[width], %[width], -1        \n"
-    "sb              $t0, 0(%[dst_u])              \n"
-    "sb              $t1, 0(%[dst_v])              \n"
-    "addiu           %[dst_u], %[dst_u], 1         \n"
-    "bgtz            %[width], 2b                  \n"
-    " addiu          %[dst_v], %[dst_v], 1         \n"
-
-  "3:                                              \n"
-    ".set pop                                      \n"
-     : [src_uv] "+r" (src_uv),
-       [width] "+r" (width),
-       [dst_u] "+r" (dst_u),
-       [dst_v] "+r" (dst_v)
-     :
-     : "t0", "t1", "t2", "t3",
-     "t4", "t5", "t6", "t7", "t8", "t9"
-  );
-}
-
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
-  __asm__ __volatile__ (
-    ".set push                             \n"
-    ".set noreorder                        \n"
-
-    "srl       $t4, %[width], 4            \n"  // multiplies of 16
-    "andi      $t5, %[width], 0xf          \n"
-    "blez      $t4, 2f                     \n"
-    " addu     %[src], %[src], %[width]    \n"  // src += width
-
-   "1:                                     \n"
-    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
-    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
-    "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
-    "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
-    "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
-    "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
-    "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
-    "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
-    "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
-    "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
-    "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
-    "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
-    "addiu     %[src], %[src], -16         \n"
-    "addiu     $t4, $t4, -1                \n"
-    "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
-    "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
-    "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
-    "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
-    "bgtz      $t4, 1b                     \n"
-    " addiu    %[dst], %[dst], 16          \n"
-    "beqz      $t5, 3f                     \n"
-    " nop                                  \n"
-
-   "2:                                     \n"
-    "lbu       $t0, -1(%[src])             \n"
-    "addiu     $t5, $t5, -1                \n"
-    "addiu     %[src], %[src], -1          \n"
-    "sb        $t0, 0(%[dst])              \n"
-    "bgez      $t5, 2b                     \n"
-    " addiu    %[dst], %[dst], 1           \n"
-
-   "3:                                     \n"
-    ".set pop                              \n"
-      : [src] "+r" (src), [dst] "+r" (dst)
-      : [width] "r" (width)
-      : "t0", "t1", "t2", "t3", "t4", "t5"
-  );
-}
-
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                       int width) {
-  int x;
-  int y;
-  __asm__ __volatile__ (
-    ".set push                                    \n"
-    ".set noreorder                               \n"
-
-    "addu            $t4, %[width], %[width]      \n"
-    "srl             %[x], %[width], 4            \n"
-    "andi            %[y], %[width], 0xf          \n"
-    "blez            %[x], 2f                     \n"
-    " addu           %[src_uv], %[src_uv], $t4    \n"
-
-   "1:                                            \n"
-    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
-    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
-    "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
-    "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
-    "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
-    "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
-    "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
-    "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
-
-    "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
-    "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
-    "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
-    "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
-    "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
-    "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
-    "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
-    "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
-    "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
-    "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
-    "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
-    "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
-    "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
-    "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
-    "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
-    "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
-    "addiu           %[src_uv], %[src_uv], -32    \n"
-    "addiu           %[x], %[x], -1               \n"
-    "swr             $t4, 0(%[dst_u])             \n"
-    "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
-    "swr             $t6, 0(%[dst_v])             \n"
-    "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
-    "swr             $t2, 4(%[dst_u])             \n"
-    "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
-    "swr             $t3, 4(%[dst_v])             \n"
-    "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
-    "swr             $t0, 8(%[dst_u])             \n"
-    "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
-    "swr             $t1, 8(%[dst_v])             \n"
-    "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
-    "swr             $t9, 12(%[dst_u])            \n"
-    "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
-    "swr             $t5, 12(%[dst_v])            \n"
-    "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
-    "addiu           %[dst_v], %[dst_v], 16       \n"
-    "bgtz            %[x], 1b                     \n"
-    " addiu          %[dst_u], %[dst_u], 16       \n"
-    "beqz            %[y], 3f                     \n"
-    " nop                                         \n"
-    "b               2f                           \n"
-    " nop                                         \n"
-
-   "2:                                            \n"
-    "lbu             $t0, -2(%[src_uv])           \n"
-    "lbu             $t1, -1(%[src_uv])           \n"
-    "addiu           %[src_uv], %[src_uv], -2     \n"
-    "addiu           %[y], %[y], -1               \n"
-    "sb              $t0, 0(%[dst_u])             \n"
-    "sb              $t1, 0(%[dst_v])             \n"
-    "addiu           %[dst_u], %[dst_u], 1        \n"
-    "bgtz            %[y], 2b                     \n"
-    " addiu          %[dst_v], %[dst_v], 1        \n"
-
-   "3:                                            \n"
-    ".set pop                                     \n"
-      : [src_uv] "+r" (src_uv),
-        [dst_u] "+r" (dst_u),
-        [dst_v] "+r" (dst_v),
-        [x] "=&r" (x),
-        [y] "=&r" (y)
-      : [width] "r" (width)
-      : "t0", "t1", "t2", "t3", "t4",
-      "t5", "t7", "t8", "t9"
-  );
-}
-
-// Convert (4 Y and 2 VU) I422 and arrange RGB values into
-// t5 = | 0 | B0 | 0 | b0 |
-// t4 = | 0 | B1 | 0 | b1 |
-// t9 = | 0 | G0 | 0 | g0 |
-// t8 = | 0 | G1 | 0 | g1 |
-// t2 = | 0 | R0 | 0 | r0 |
-// t1 = | 0 | R1 | 0 | r1 |
-#define YUVTORGB                                                               \
-      "lw                $t0, 0(%[y_buf])       \n"                            \
-      "lhu               $t1, 0(%[u_buf])       \n"                            \
-      "lhu               $t2, 0(%[v_buf])       \n"                            \
-      "preceu.ph.qbr     $t1, $t1               \n"                            \
-      "preceu.ph.qbr     $t2, $t2               \n"                            \
-      "preceu.ph.qbra    $t3, $t0               \n"                            \
-      "preceu.ph.qbla    $t0, $t0               \n"                            \
-      "subu.ph           $t1, $t1, $s5          \n"                            \
-      "subu.ph           $t2, $t2, $s5          \n"                            \
-      "subu.ph           $t3, $t3, $s4          \n"                            \
-      "subu.ph           $t0, $t0, $s4          \n"                            \
-      "mul.ph            $t3, $t3, $s0          \n"                            \
-      "mul.ph            $t0, $t0, $s0          \n"                            \
-      "shll.ph           $t4, $t1, 0x7          \n"                            \
-      "subu.ph           $t4, $t4, $t1          \n"                            \
-      "mul.ph            $t6, $t1, $s1          \n"                            \
-      "mul.ph            $t1, $t2, $s2          \n"                            \
-      "addq_s.ph         $t5, $t4, $t3          \n"                            \
-      "addq_s.ph         $t4, $t4, $t0          \n"                            \
-      "shra.ph           $t5, $t5, 6            \n"                            \
-      "shra.ph           $t4, $t4, 6            \n"                            \
-      "addiu             %[u_buf], 2            \n"                            \
-      "addiu             %[v_buf], 2            \n"                            \
-      "addu.ph           $t6, $t6, $t1          \n"                            \
-      "mul.ph            $t1, $t2, $s3          \n"                            \
-      "addu.ph           $t9, $t6, $t3          \n"                            \
-      "addu.ph           $t8, $t6, $t0          \n"                            \
-      "shra.ph           $t9, $t9, 6            \n"                            \
-      "shra.ph           $t8, $t8, 6            \n"                            \
-      "addu.ph           $t2, $t1, $t3          \n"                            \
-      "addu.ph           $t1, $t1, $t0          \n"                            \
-      "shra.ph           $t2, $t2, 6            \n"                            \
-      "shra.ph           $t1, $t1, 6            \n"                            \
-      "subu.ph           $t5, $t5, $s5          \n"                            \
-      "subu.ph           $t4, $t4, $s5          \n"                            \
-      "subu.ph           $t9, $t9, $s5          \n"                            \
-      "subu.ph           $t8, $t8, $s5          \n"                            \
-      "subu.ph           $t2, $t2, $s5          \n"                            \
-      "subu.ph           $t1, $t1, $s5          \n"                            \
-      "shll_s.ph         $t5, $t5, 8            \n"                            \
-      "shll_s.ph         $t4, $t4, 8            \n"                            \
-      "shll_s.ph         $t9, $t9, 8            \n"                            \
-      "shll_s.ph         $t8, $t8, 8            \n"                            \
-      "shll_s.ph         $t2, $t2, 8            \n"                            \
-      "shll_s.ph         $t1, $t1, 8            \n"                            \
-      "shra.ph           $t5, $t5, 8            \n"                            \
-      "shra.ph           $t4, $t4, 8            \n"                            \
-      "shra.ph           $t9, $t9, 8            \n"                            \
-      "shra.ph           $t8, $t8, 8            \n"                            \
-      "shra.ph           $t2, $t2, 8            \n"                            \
-      "shra.ph           $t1, $t1, 8            \n"                            \
-      "addu.ph           $t5, $t5, $s5          \n"                            \
-      "addu.ph           $t4, $t4, $s5          \n"                            \
-      "addu.ph           $t9, $t9, $s5          \n"                            \
-      "addu.ph           $t8, $t8, $s5          \n"                            \
-      "addu.ph           $t2, $t2, $s5          \n"                            \
-      "addu.ph           $t1, $t1, $s5          \n"
-
-// TODO(fbarchard): accept yuv conversion constants.
-void I422ToARGBRow_DSPR2(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm__ __volatile__ (
-    ".set push                                \n"
-    ".set noreorder                           \n"
-    "beqz              %[width], 2f           \n"
-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
-    "repl.ph           $s5, 128               \n"  // |128|128| // clipping
-    "lui               $s6, 0xff00            \n"
-    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|
-
-   "1:                                        \n"
-      YUVTORGB
-// Arranging into argb format
-    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
-    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
-    "addiu             %[width], -4           \n"
-    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |G1|B1|G0|B0|
-    "precr.qb.ph       $t9, $t4, $t5          \n"  // |g1|b1|g0|b0|
-    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
-
-    "addiu             %[y_buf], 4            \n"
-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
-    "or                $t1, $t1, $s6          \n"  // |ff|R1|ff|R0|
-    "or                $t2, $t2, $s6          \n"  // |ff|r1|ff|r0|
-    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|r1|g1|b1|
-    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|R1|G1|B1|
-    "sll               $t9, $t9, 16           \n"
-    "sll               $t8, $t8, 16           \n"
-    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|r0|g0|b0|
-    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|R0|G0|B0|
-// Store results.
-    "sw                $t2, 0(%[rgb_buf])     \n"
-    "sw                $t0, 4(%[rgb_buf])     \n"
-    "sw                $t1, 8(%[rgb_buf])     \n"
-    "sw                $t3, 12(%[rgb_buf])    \n"
-    "bnez              %[width], 1b           \n"
-    " addiu            %[rgb_buf], 16         \n"
-   "2:                                        \n"
-    ".set pop                                 \n"
-      :[y_buf] "+r" (y_buf),
-       [u_buf] "+r" (u_buf),
-       [v_buf] "+r" (v_buf),
-       [width] "+r" (width),
-       [rgb_buf] "+r" (rgb_buf)
-      :
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-      "t6", "t7", "t8", "t9",
-      "s0", "s1", "s2", "s3",
-      "s4", "s5", "s6"
-  );
-}
-
-// Bilinear filter 8x2 -> 8x1
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
-    int y0_fraction = 256 - source_y_fraction;
-    const uint8* src_ptr1 = src_ptr + src_stride;
-
-  __asm__ __volatile__ (
-     ".set push                                           \n"
-     ".set noreorder                                      \n"
-
-     "replv.ph          $t0, %[y0_fraction]               \n"
-     "replv.ph          $t1, %[source_y_fraction]         \n"
-
-   "1:                                                    \n"
-     "lw                $t2, 0(%[src_ptr])                \n"
-     "lw                $t3, 0(%[src_ptr1])               \n"
-     "lw                $t4, 4(%[src_ptr])                \n"
-     "lw                $t5, 4(%[src_ptr1])               \n"
-     "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
-     "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
-     "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
-     "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
-     "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
-     "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
-     "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
-     "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
-     "addq.ph           $t6, $t6, $t8                     \n"
-     "addq.ph           $t7, $t7, $t9                     \n"
-     "addq.ph           $t2, $t2, $t4                     \n"
-     "addq.ph           $t3, $t3, $t5                     \n"
-     "shra.ph           $t6, $t6, 8                       \n"
-     "shra.ph           $t7, $t7, 8                       \n"
-     "shra.ph           $t2, $t2, 8                       \n"
-     "shra.ph           $t3, $t3, 8                       \n"
-     "precr.qb.ph       $t6, $t6, $t7                     \n"
-     "precr.qb.ph       $t2, $t2, $t3                     \n"
-     "addiu             %[src_ptr], %[src_ptr], 8         \n"
-     "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
-     "addiu             %[dst_width], %[dst_width], -8    \n"
-     "sw                $t6, 0(%[dst_ptr])                \n"
-     "sw                $t2, 4(%[dst_ptr])                \n"
-     "bgtz              %[dst_width], 1b                  \n"
-     " addiu            %[dst_ptr], %[dst_ptr], 8         \n"
-
-     ".set pop                                            \n"
-  : [dst_ptr] "+r" (dst_ptr),
-    [src_ptr1] "+r" (src_ptr1),
-    [src_ptr] "+r" (src_ptr),
-    [dst_width] "+r" (dst_width)
-  : [source_y_fraction] "r" (source_y_fraction),
-    [y0_fraction] "r" (y0_fraction),
-    [src_stride] "r" (src_stride)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
-#endif  // __mips_dsp_rev >= 2
-
-#endif  // defined(__mips__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/files/source/row_msa.cc b/files/source/row_msa.cc
new file mode 100644
index 0000000..f79de1c
--- /dev/null
+++ b/files/source/row_msa.cc
@@ -0,0 +1,2977 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "libyuv/row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
+  {                                                              \
+    ub = __msa_fill_w(yuvconst->kUVToB[0]);                      \
+    vr = __msa_fill_w(yuvconst->kUVToR[1]);                      \
+    ug = __msa_fill_w(yuvconst->kUVToG[0]);                      \
+    vg = __msa_fill_w(yuvconst->kUVToG[1]);                      \
+    bb = __msa_fill_w(yuvconst->kUVBiasB[0]);                    \
+    bg = __msa_fill_w(yuvconst->kUVBiasG[0]);                    \
+    br = __msa_fill_w(yuvconst->kUVBiasR[0]);                    \
+    yg = __msa_fill_w(yuvconst->kYToRgb[0]);                     \
+  }
+
+// Load YUV 422 pixel data
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v)  \
+  {                                                              \
+    uint64 y_m;                                                  \
+    uint32 u_m, v_m;                                             \
+    v4i32 zero_m = {0};                                          \
+    y_m = LD(psrc_y);                                            \
+    u_m = LW(psrc_u);                                            \
+    v_m = LW(psrc_v);                                            \
+    out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
+    out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m);        \
+    out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m);        \
+  }
+
+// Clip input vector elements between 0 to 255
+#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
+  {                                               \
+    v4i32 max_m = __msa_ldi_w(0xFF);              \
+                                                  \
+    in0 = __msa_maxi_s_w(in0, 0);                 \
+    in1 = __msa_maxi_s_w(in1, 0);                 \
+    in2 = __msa_maxi_s_w(in2, 0);                 \
+    in3 = __msa_maxi_s_w(in3, 0);                 \
+    in4 = __msa_maxi_s_w(in4, 0);                 \
+    in5 = __msa_maxi_s_w(in5, 0);                 \
+    in0 = __msa_min_s_w(max_m, in0);              \
+    in1 = __msa_min_s_w(max_m, in1);              \
+    in2 = __msa_min_s_w(max_m, in2);              \
+    in3 = __msa_min_s_w(max_m, in3);              \
+    in4 = __msa_min_s_w(max_m, in4);              \
+    in5 = __msa_min_s_w(max_m, in5);              \
+  }
+
+// Convert 8 pixels of YUV 420 to RGB.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
+  {                                                                            \
+    v8i16 vec0_m, vec1_m;                                                      \
+    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                              \
+    v4i32 reg5_m, reg6_m, reg7_m;                                              \
+    v16i8 zero_m = {0};                                                        \
+                                                                               \
+    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y);                    \
+    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv);                 \
+    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m);                \
+    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m);                \
+    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m);                \
+    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m);                \
+    reg0_m *= yg;                                                              \
+    reg1_m *= yg;                                                              \
+    reg2_m *= ubvr;                                                            \
+    reg3_m *= ubvr;                                                            \
+    reg0_m = __msa_srai_w(reg0_m, 16);                                         \
+    reg1_m = __msa_srai_w(reg1_m, 16);                                         \
+    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg);                       \
+    reg5_m = __msa_ilvev_w(reg2_m, reg2_m);                                    \
+    reg6_m = __msa_ilvev_w(reg3_m, reg3_m);                                    \
+    reg7_m = __msa_ilvr_w(reg4_m, reg4_m);                                     \
+    reg2_m = __msa_ilvod_w(reg2_m, reg2_m);                                    \
+    reg3_m = __msa_ilvod_w(reg3_m, reg3_m);                                    \
+    reg4_m = __msa_ilvl_w(reg4_m, reg4_m);                                     \
+    reg5_m = reg0_m - reg5_m;                                                  \
+    reg6_m = reg1_m - reg6_m;                                                  \
+    reg2_m = reg0_m - reg2_m;                                                  \
+    reg3_m = reg1_m - reg3_m;                                                  \
+    reg7_m = reg0_m - reg7_m;                                                  \
+    reg4_m = reg1_m - reg4_m;                                                  \
+    reg5_m += bb;                                                              \
+    reg6_m += bb;                                                              \
+    reg7_m += bg;                                                              \
+    reg4_m += bg;                                                              \
+    reg2_m += br;                                                              \
+    reg3_m += br;                                                              \
+    reg5_m = __msa_srai_w(reg5_m, 6);                                          \
+    reg6_m = __msa_srai_w(reg6_m, 6);                                          \
+    reg7_m = __msa_srai_w(reg7_m, 6);                                          \
+    reg4_m = __msa_srai_w(reg4_m, 6);                                          \
+    reg2_m = __msa_srai_w(reg2_m, 6);                                          \
+    reg3_m = __msa_srai_w(reg3_m, 6);                                          \
+    CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m);               \
+    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m);                       \
+    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m);                       \
+    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m);                       \
+  }
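+
+// A scalar sketch of the per-pixel fixed-point math the macro above
+// vectorizes (a rough reading of the code, not an authoritative reference):
+//   y1 = (y * 0x0101 * yg) >> 16    // Y byte duplicated before scaling
+//   b  = clamp((y1 - u * ub            + bb) >> 6, 0, 255)
+//   g  = clamp((y1 - (u * ug + v * vg) + bg) >> 6, 0, 255)
+//   r  = clamp((y1 -            v * vr + br) >> 6, 0, 255)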
+
+// Pack and Store 8 ARGB values.
+#define STOREARGB(in0, in1, in2, in3, pdst_argb)           \
+  {                                                        \
+    v8i16 vec0_m, vec1_m;                                  \
+    v16u8 dst0_m, dst1_m;                                  \
+    vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+    vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+    dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m);          \
+    dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m);          \
+    ST_UB2(dst0_m, dst1_m, pdst_argb, 16);                 \
+  }
+
+// Takes ARGB input and calculates Y.
+#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
+                y_out)                                                     \
+  {                                                                        \
+    v16u8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
+    v8u16 reg0_m, reg1_m;                                                  \
+                                                                           \
+    vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0);             \
+    vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2);             \
+    vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0);             \
+    vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2);             \
+    reg0_m = __msa_dotp_u_h(vec0_m, const0);                               \
+    reg1_m = __msa_dotp_u_h(vec1_m, const0);                               \
+    reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1);                      \
+    reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1);                      \
+    reg0_m += const2;                                                      \
+    reg1_m += const2;                                                      \
+    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift);                    \
+    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift);                    \
+    y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);            \
+  }
+
+// Loads the current and next rows of ARGB input and averages them to
+// calculate U and V.
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3)               \
+  {                                                                       \
+    v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
+    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+    v16u8 vec8_m, vec9_m;                                                 \
+    v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
+    v8u16 reg8_m, reg9_m;                                                 \
+                                                                          \
+    src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0);                             \
+    src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16);                            \
+    src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32);                            \
+    src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48);                            \
+    src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0);                             \
+    src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16);                            \
+    src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32);                            \
+    src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48);                            \
+    vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
+    vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
+    reg0_m = __msa_hadd_u_h(vec0_m, vec0_m);                              \
+    reg1_m = __msa_hadd_u_h(vec1_m, vec1_m);                              \
+    reg2_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
+    reg3_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
+    reg4_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
+    reg5_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
+    reg6_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
+    reg7_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
+    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
+    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
+    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
+    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
+    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
+    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
+    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
+    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
+    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
+    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
+    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
+    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
+    argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
+    argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
+    src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64);                            \
+    src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80);                            \
+    src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96);                            \
+    src3_m = (v16u8)__msa_ld_b((v16i8*)s, 112);                           \
+    src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64);                            \
+    src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80);                            \
+    src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96);                            \
+    src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112);                           \
+    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
+    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
+    reg0_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
+    reg1_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
+    reg2_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
+    reg3_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
+    reg4_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
+    reg5_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
+    reg6_m = __msa_hadd_u_h(vec8_m, vec8_m);                              \
+    reg7_m = __msa_hadd_u_h(vec9_m, vec9_m);                              \
+    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
+    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
+    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
+    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
+    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
+    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
+    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
+    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
+    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
+    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
+    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
+    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
+    argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
+    argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
+  }
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+                 shf0, shf1, shf2, shf3, v_out, u_out)                       \
+  {                                                                          \
+    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
+    v8u16 reg0_m, reg1_m, reg2_m, reg3_m;                                    \
+                                                                             \
+    vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0);          \
+    vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2);          \
+    vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0);          \
+    vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2);          \
+    vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0);          \
+    vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2);          \
+    vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0);          \
+    vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2);          \
+    reg0_m = __msa_dotp_u_h(vec0_m, const1);                                 \
+    reg1_m = __msa_dotp_u_h(vec1_m, const1);                                 \
+    reg2_m = __msa_dotp_u_h(vec4_m, const1);                                 \
+    reg3_m = __msa_dotp_u_h(vec5_m, const1);                                 \
+    reg0_m += const3;                                                        \
+    reg1_m += const3;                                                        \
+    reg2_m += const3;                                                        \
+    reg3_m += const3;                                                        \
+    reg0_m -= __msa_dotp_u_h(vec2_m, const0);                                \
+    reg1_m -= __msa_dotp_u_h(vec3_m, const0);                                \
+    reg2_m -= __msa_dotp_u_h(vec6_m, const2);                                \
+    reg3_m -= __msa_dotp_u_h(vec7_m, const2);                                \
+    v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m);              \
+    u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m);              \
+  }
+
+// Load I444 pixel data
+#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
+  {                                                           \
+    uint64 y_m, u_m, v_m;                                     \
+    v2i64 zero_m = {0};                                       \
+    y_m = LD(psrc_y);                                         \
+    u_m = LD(psrc_u);                                         \
+    v_m = LD(psrc_v);                                         \
+    out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m);     \
+    out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m);     \
+    out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m);     \
+  }
+
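+// Mirror a row of bytes (horizontal reverse), processing 64 bytes per loop.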
+void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  src += width - 64;
+
+  for (x = 0; x < width; x += 64) {
+    LD_UB4(src, 16, src3, src2, src1, src0);
+    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+    dst += 64;
+    src -= 64;
+  }
+}
+
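+// Mirror a row of ARGB pixels, processing 16 pixels (64 bytes) per loop.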
+void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+  src += width * 4 - 64;
+
+  for (x = 0; x < width; x += 16) {
+    LD_UB4(src, 16, src3, src2, src1, src0);
+    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+    dst += 64;
+    src -= 64;
+  }
+}
+
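+// Interleave I422 planes into packed YUY2 (Y0 U0 Y1 V0), 32 pixels per loop.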
+void I422ToYUY2Row_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_yuy2,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
+
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
+    ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
+    ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_yuy2 += 64;
+  }
+}
+
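+// Interleave I422 planes into packed UYVY (U0 Y0 V0 Y1), 32 pixels per loop.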
+void I422ToUYVYRow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_uyvy,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
+
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
+    ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
+    ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_uyvy += 64;
+  }
+}
+
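+// Convert I422 planar data to packed ARGB, 8 pixels per loop.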
+void I422ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    rgb_buf += 32;
+  }
+}
+
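+// Identical to I422ToARGBRow_MSA except STOREARGB is called with alpha as the
+// first argument, producing RGBA byte order.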
+void I422ToRGBARow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(alpha, vec0, vec1, vec2, rgb_buf);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    rgb_buf += 32;
+  }
+}
+
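+// Like I422ToARGBRow_MSA, but alpha is read from src_a (8 bytes per loop)
+// instead of being constant.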
+void I422AlphaToARGBRow_MSA(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            const uint8* src_a,
+                            uint8* rgb_buf,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  int x;
+  int64 data_a;
+  v16u8 src0, src1, src2, src3;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v4i32 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    data_a = LD(src_a);
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
+    STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    src_a += 8;
+    rgb_buf += 32;
+  }
+}
+
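+// Converts 16 pixels per iteration with two YUVTORGB passes; the three
+// shufflers pack the resulting channels into 48 bytes of RGB24.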
+void I422ToRGB24Row_MSA(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int32 width) {
+  int x;
+  int64 data_u, data_v;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 reg0, reg1, reg2, reg3;
+  v2i64 zero = {0};
+  v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
+  v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
+  v16i8 shuffler2 = {26, 6,  7,  27, 8,  9,  28, 10,
+                     11, 29, 12, 13, 30, 14, 15, 31};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
+    data_u = LD(src_u);
+    data_v = LD(src_v);
+    src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
+    src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
+    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec3, vec4, vec5);
+    reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
+    reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
+    reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    ST_UB(dst2, (rgb_buf + 32));
+    src_y += 16;
+    src_u += 8;
+    src_v += 8;
+    rgb_buf += 48;
+  }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
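+// Each converted pixel is packed as b | (g << 5) | (r << 11) after reducing
+// the channels to 5/6/5 bits.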
+void I422ToRGB565Row_MSA(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec2, vec1);
+    vec0 = __msa_srai_h(vec0, 3);
+    vec1 = __msa_srai_h(vec1, 3);
+    vec2 = __msa_srai_h(vec2, 2);
+    vec1 = __msa_slli_h(vec1, 11);
+    vec2 = __msa_slli_h(vec2, 5);
+    vec0 |= vec1;
+    dst0 = (v16u8)(vec2 | vec0);
+    ST_UB(dst0, dst_rgb565);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_rgb565 += 16;
+  }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
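+// Each pixel is packed as b | (g << 4) | (r << 8) with the alpha nibble
+// forced to 0xF.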
+void I422ToARGB4444Row_MSA(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb4444,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0;
+  v8i16 vec0, vec1, vec2;
+  v8u16 reg0, reg1, reg2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    reg0 = (v8u16)__msa_srai_h(vec0, 4);
+    reg1 = (v8u16)__msa_srai_h(vec1, 4);
+    reg2 = (v8u16)__msa_srai_h(vec2, 4);
+    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
+    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
+    reg1 |= const_0xF000;
+    reg0 |= reg2;
+    dst0 = (v16u8)(reg1 | reg0);
+    ST_UB(dst0, dst_argb4444);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb4444 += 16;
+  }
+}
+
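+// Each pixel is packed as b | (g << 5) | (r << 10) | 0x8000 (opaque alpha).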
+void I422ToARGB1555Row_MSA(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb1555,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0;
+  v8i16 vec0, vec1, vec2;
+  v8u16 reg0, reg1, reg2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    reg0 = (v8u16)__msa_srai_h(vec0, 3);
+    reg1 = (v8u16)__msa_srai_h(vec1, 3);
+    reg2 = (v8u16)__msa_srai_h(vec2, 3);
+    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
+    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
+    reg1 |= const_0x8000;
+    reg0 |= reg2;
+    dst0 = (v16u8)(reg1 | reg0);
+    ST_UB(dst0, dst_argb1555);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb1555 += 16;
+  }
+}
+
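+// Extracts the luma (even) bytes of 32 YUY2 pixels per iteration.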
+void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_yuy2 += 64;
+    dst_y += 32;
+  }
+}
+
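+// Gathers the chroma (odd) bytes from two adjacent YUY2 rows, averages them
+// vertically and splits the result into the U and V planes.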
+void YUY2ToUVRow_MSA(const uint8* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
+  const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
+    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec0 = __msa_aver_u_b(src0, src2);
+    vec1 = __msa_aver_u_b(src1, src3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_yuy2 += 64;
+    src_yuy2_next += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
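+// As YUY2ToUVRow_MSA, but samples a single row (no vertical averaging).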
+void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_yuy2 += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
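+// Extracts the luma (odd) bytes of 32 UYVY pixels per iteration.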
+void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_uyvy += 64;
+    dst_y += 32;
+  }
+}
+
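+// Gathers the chroma (even) bytes from two adjacent UYVY rows, averages them
+// vertically and splits the result into the U and V planes.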
+void UYVYToUVRow_MSA(const uint8* src_uyvy,
+                     int src_stride_uyvy,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
+  const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
+    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec0 = __msa_aver_u_b(src0, src2);
+    vec1 = __msa_aver_u_b(src1, src3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_uyvy += 64;
+    src_uyvy_next += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
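+// As UYVYToUVRow_MSA, but samples a single row (no vertical averaging).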
+void UYVYToUV422Row_MSA(const uint8* src_uyvy,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_uyvy += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
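+// Luma per the constants below, 16 pixels per iteration:
+//   y = (25 * b + 129 * g + 66 * r + 0x1080) >> 8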
+void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v16i8 zero = {0};
+  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
+    reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
+    reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
+    reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
+    reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
+    reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
+    reg0 *= const_0x19;
+    reg1 *= const_0x19;
+    reg2 *= const_0x81;
+    reg3 *= const_0x81;
+    reg4 *= const_0x42;
+    reg5 *= const_0x42;
+    reg0 += reg2;
+    reg1 += reg3;
+    reg0 += reg4;
+    reg1 += reg5;
+    reg0 += const_0x1080;
+    reg1 += const_0x1080;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
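+// Box-averages B/G/R over 2x2 blocks, then per the constants below:
+//   u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8
+//   v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8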
+void ARGBToUVRow_MSA(const uint8* src_argb0,
+                     int src_stride_argb,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
+  int x;
+  const uint8* src_argb0_next = src_argb0 + src_stride_argb;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+  v16u8 dst0, dst1;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
+    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
+    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
+    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+    reg0 = __msa_hadd_u_h(vec8, vec8);
+    reg1 = __msa_hadd_u_h(vec9, vec9);
+    reg2 = __msa_hadd_u_h(vec4, vec4);
+    reg3 = __msa_hadd_u_h(vec5, vec5);
+    reg4 = __msa_hadd_u_h(vec0, vec0);
+    reg5 = __msa_hadd_u_h(vec1, vec1);
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
+    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
+    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
+    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
+    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+    reg0 += __msa_hadd_u_h(vec8, vec8);
+    reg1 += __msa_hadd_u_h(vec9, vec9);
+    reg2 += __msa_hadd_u_h(vec4, vec4);
+    reg3 += __msa_hadd_u_h(vec5, vec5);
+    reg4 += __msa_hadd_u_h(vec0, vec0);
+    reg5 += __msa_hadd_u_h(vec1, vec1);
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
+    reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
+    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
+    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+    reg6 = reg0 * const_0x70;
+    reg7 = reg1 * const_0x70;
+    reg8 = reg2 * const_0x4A;
+    reg9 = reg3 * const_0x4A;
+    reg6 += const_0x8080;
+    reg7 += const_0x8080;
+    reg8 += reg4 * const_0x26;
+    reg9 += reg5 * const_0x26;
+    reg0 *= const_0x12;
+    reg1 *= const_0x12;
+    reg2 *= const_0x5E;
+    reg3 *= const_0x5E;
+    reg4 *= const_0x70;
+    reg5 *= const_0x70;
+    reg2 += reg0;
+    reg3 += reg1;
+    reg4 += const_0x8080;
+    reg5 += const_0x8080;
+    reg6 -= reg8;
+    reg7 -= reg9;
+    reg4 -= reg2;
+    reg5 -= reg3;
+    reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
+    reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
+    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
+    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_argb0 += 128;
+    src_argb0_next += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
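+// Drops the alpha bytes: three shuffles turn 16 ARGB pixels (64 bytes) into
+// 48 bytes of B,G,R triples.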
+void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+  v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
+  v16i8 shuffler1 = {5,  6,  8,  9,  10, 12, 13, 14,
+                     16, 17, 18, 20, 21, 22, 24, 25};
+  v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
+                     21, 22, 24, 25, 26, 28, 29, 30};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_rgb, 16);
+    ST_UB(dst2, (dst_rgb + 32));
+    src_argb += 64;
+    dst_rgb += 48;
+  }
+}
+
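+// As ARGBToRGB24Row_MSA, but the shufflers emit R,G,B byte order (RAW).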
+void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+  v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
+  v16i8 shuffler1 = {5,  4,  10, 9,  8,  14, 13, 12,
+                     18, 17, 16, 22, 21, 20, 26, 25};
+  v16i8 shuffler2 = {8,  14, 13, 12, 18, 17, 16, 22,
+                     21, 20, 26, 25, 24, 30, 29, 28};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_rgb, 16);
+    ST_UB(dst2, (dst_rgb + 32));
+    src_argb += 64;
+    dst_rgb += 48;
+  }
+}
+
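+// Reduces each pixel to 5/6/5 bits and packs to RGB565 using shift and
+// bit-insert (binsli) operations, 8 pixels per iteration.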
+void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
+    vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
+    vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+    vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
+    vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
+    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+    vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
+    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
+    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
+    vec0 = __msa_binsli_b(vec0, vec1, 2);
+    vec1 = __msa_binsli_b(vec2, vec3, 4);
+    vec4 = __msa_binsli_b(vec4, vec5, 2);
+    vec5 = __msa_binsli_b(vec6, vec7, 4);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
+    dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
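+// Reduces each pixel to 5/5/5 bits plus a 1-bit alpha and packs to ARGB1555.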
+void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
+    vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
+    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+    vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
+    vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+    vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
+    vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
+    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
+    vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
+    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
+    vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
+    vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
+    vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
+    vec0 = __msa_binsli_b(vec0, vec1, 2);
+    vec5 = __msa_binsli_b(vec5, vec6, 2);
+    vec1 = __msa_binsli_b(vec2, vec3, 5);
+    vec6 = __msa_binsli_b(vec7, vec8, 5);
+    vec1 = __msa_binsli_b(vec1, vec4, 0);
+    vec6 = __msa_binsli_b(vec6, vec9, 0);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
+    dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
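+// Keeps the top nibble of each channel and packs two channels per output
+// byte to form ARGB4444.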
+void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1;
+  v16u8 vec0, vec1;
+  v16u8 dst0;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
+    vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
+    src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
+    src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
+    vec0 = __msa_binsli_b(vec0, src0, 3);
+    vec1 = __msa_binsli_b(vec1, src1, 3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
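+// Full-resolution chroma per the constants below (32896 == 0x8080):
+//   u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8
+//   v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8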
+void ARGBToUV444Row_MSA(const uint8* src_argb,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int32 width) {
+  int32 x;
+  v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 vec8, vec9, vec10, vec11;
+  v8u16 const_112 = (v8u16)__msa_ldi_h(112);
+  v8u16 const_74 = (v8u16)__msa_ldi_h(74);
+  v8u16 const_38 = (v8u16)__msa_ldi_h(38);
+  v8u16 const_94 = (v8u16)__msa_ldi_h(94);
+  v8u16 const_18 = (v8u16)__msa_ldi_h(18);
+  v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
+  v16i8 zero = {0};
+
+  for (x = width; x > 0; x -= 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+    reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+    src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
+    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec10 = vec0 * const_18;
+    vec11 = vec1 * const_18;
+    vec8 = vec2 * const_94;
+    vec9 = vec3 * const_94;
+    vec6 = vec4 * const_112;
+    vec7 = vec5 * const_112;
+    vec0 *= const_112;
+    vec1 *= const_112;
+    vec2 *= const_74;
+    vec3 *= const_74;
+    vec4 *= const_38;
+    vec5 *= const_38;
+    vec8 += vec10;
+    vec9 += vec11;
+    vec6 += const_32896;
+    vec7 += const_32896;
+    vec0 += const_32896;
+    vec1 += const_32896;
+    vec2 += vec4;
+    vec3 += vec5;
+    vec0 -= vec2;
+    vec1 -= vec3;
+    vec6 -= vec8;
+    vec7 -= vec9;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
+    vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_argb += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
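+// Per-channel multiply of two ARGB rows; one operand is widened as v * 257 so
+// (x * 257 * y) >> 16 approximates x * y / 255.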
+void ARGBMultiplyRow_MSA(const uint8* src_argb0,
+                         const uint8* src_argb1,
+                         uint8* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4u32 reg0, reg1, reg2, reg3;
+  v8i16 zero = {0};
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_argb);
+    src_argb0 += 16;
+    src_argb1 += 16;
+    dst_argb += 16;
+  }
+}
+
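+// Saturating per-byte add of two ARGB rows, 8 pixels per iteration.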
+void ARGBAddRow_MSA(const uint8* src_argb0,
+                    const uint8* src_argb1,
+                    uint8* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
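+// Saturating per-byte subtract (src_argb0 - src_argb1), 8 pixels per
+// iteration.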
+void ARGBSubtractRow_MSA(const uint8* src_argb0,
+                         const uint8* src_argb1,
+                         uint8* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+    dst0 = __msa_subs_u_b(src0, src2);
+    dst1 = __msa_subs_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
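+// Premultiplies B/G/R by the pixel's alpha (both widened by 257, product
+// shifted right by 24, roughly c * a / 255); the mask keeps the alpha bytes.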
+void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v8i16 zero = {0};
+  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
+    vec4 = (v8u16)__msa_fill_h(vec0[3]);
+    vec5 = (v8u16)__msa_fill_h(vec0[7]);
+    vec6 = (v8u16)__msa_fill_h(vec1[3]);
+    vec7 = (v8u16)__msa_fill_h(vec1[7]);
+    vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    vec6 = (v8u16)__msa_fill_h(vec2[3]);
+    vec7 = (v8u16)__msa_fill_h(vec2[7]);
+    vec8 = (v8u16)__msa_fill_h(vec3[3]);
+    vec9 = (v8u16)__msa_fill_h(vec3[7]);
+    vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
+    reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
+    reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
+    reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
+    reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
+    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+    reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+    reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+    reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+    reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
+    reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
+    reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
+    reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    dst0 = __msa_bmnz_v(dst0, src0, mask);
+    dst1 = __msa_bmnz_v(dst1, src1, mask);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
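+// Adds the dither byte for each x position (dither4 repeats every 4 pixels)
+// to all channels, clamps to [0, 255] and packs to RGB565.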
+void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
+                               uint8* dst_rgb,
+                               uint32 dither4,
+                               int width) {
+  int x;
+  v16u8 src0, src1, dst0, vec0, vec1;
+  v8i16 vec_d0;
+  v8i16 reg0, reg1, reg2;
+  v16i8 zero = {0};
+  v8i16 max = __msa_ldi_h(0xFF);
+
+  vec_d0 = (v8i16)__msa_fill_w(dither4);
+  vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
+    reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
+    reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
+    reg0 += vec_d0;
+    reg1 += vec_d0;
+    reg2 += vec_d0;
+    reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
+    reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
+    reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
+    reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
+    reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
+    reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
+    reg0 = __msa_srai_h(reg0, 3);
+    reg2 = __msa_srai_h(reg2, 3);
+    reg1 = __msa_srai_h(reg1, 2);
+    reg2 = __msa_slli_h(reg2, 11);
+    reg1 = __msa_slli_h(reg1, 5);
+    reg0 |= reg1;
+    dst0 = (v16u8)(reg0 | reg2);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
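+// Expands the 4-byte shuffler into a per-pixel byte shuffle and reorders the
+// channels of 8 pixels per iteration.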
+void ARGBShuffleRow_MSA(const uint8* src_argb,
+                        uint8* dst_argb,
+                        const uint8* shuffler,
+                        int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+  v16i8 vec0;
+  v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
+  int32 val = LW((int32*)shuffler);
+
+  vec0 = (v16i8)__msa_fill_w(val);
+  shuffler_vec += vec0;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+    dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
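+// Scales each channel by the matching byte of 'value' using the same
+// widen-by-257 / shift-by-24 trick as ARGBAttenuateRow_MSA.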
+void ARGBShadeRow_MSA(const uint8* src_argb,
+                      uint8* dst_argb,
+                      int width,
+                      uint32 value) {
+  int x;
+  v16u8 src0, dst0;
+  v8u16 vec0, vec1;
+  v4u32 reg0, reg1, reg2, reg3, rgba_scale;
+  v8i16 zero = {0};
+
+  rgba_scale[0] = value;
+  rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
+  rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg0 *= rgba_scale;
+    reg1 *= rgba_scale;
+    reg2 *= rgba_scale;
+    reg3 *= rgba_scale;
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_argb);
+    src_argb += 16;
+    dst_argb += 16;
+  }
+}
+
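+// Gray per the dot-product constants below, alpha preserved:
+//   y = (15 * b + 75 * g + 38 * r + 64) >> 7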
+void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, dst0, dst1;
+  v8u16 reg0;
+  v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
+  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+    reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
+    reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
+    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
+    vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
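+// In-place sepia per the dot-product constants below, alpha preserved:
+//   b = (17b + 68g + 35r) >> 7
+//   g = min(255, (22b + 88g + 45r) >> 7)
+//   r = min(255, (24b + 98g + 50r) >> 7)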
+void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
+  v8u16 reg0, reg1, reg2;
+  v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
+  v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
+  v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
+  v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
+  v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
+  v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
+  v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
+    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
+    reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
+    reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
+    reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
+    reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
+    reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
+    reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
+    reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
+    reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
+    vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+    vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    dst_argb += 32;
+  }
+}
+
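+// Expands every 4-bit channel (including alpha) to 8 bits by replicating the
+// nibble.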
+void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
+                           uint8* dst_argb,
+                           int width) {
+  int x;
+  v16u8 src0, src1;
+  v8u16 vec0, vec1, vec2, vec3;
+  v16u8 dst0, dst1, dst2, dst3;
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
+    vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
+    vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
+    vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
+    vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
+    vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
+    vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
+    vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
+    vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_argb4444 += 32;
+    dst_argb += 64;
+  }
+}
+
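+// Expands the 5-bit channels to 8 bits as (v << 3) | (v >> 2) and stretches
+// the alpha bit to 0x00 or 0xFF.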
+void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
+                           uint8* dst_argb,
+                           int width) {
+  int x;
+  v8u16 src0, src1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
+  v16u8 dst0, dst1, dst2, dst3;
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0);
+    src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+    vec2 = src0 & const_0x1F;
+    vec3 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+    reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
+    reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
+    reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
+    reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
+    reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
+    reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
+    reg3 = -reg3;
+    reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
+    reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
+    reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
+    reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_argb1555 += 32;
+    dst_argb += 64;
+  }
+}
+
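+// Expands the 5/6/5 channels to 8 bits by replicating their top bits and
+// inserts an opaque alpha byte.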
+void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  int x;
+  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0);
+    src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src0 & const_0x7E0;
+    vec2 = src0 & const_0xF800;
+    vec3 = src1 & const_0x1F;
+    vec4 = src1 & const_0x7E0;
+    vec5 = src1 & const_0xF800;
+    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
+    res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
+    res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_rgb565 += 32;
+    dst_argb += 64;
+  }
+}
+
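+// Inserts an alpha byte after every B,G,R triple: 48 input bytes become
+// 64 ARGB bytes per iteration.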
+void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v16u8 vec0, vec1, vec2;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32);
+    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+    dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
+    dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_rgb24 += 48;
+    dst_argb += 64;
+  }
+}
+
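+// As RGB24ToARGBRow_MSA, but the mask also swaps R and B while inserting
+// alpha.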
+void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v16u8 vec0, vec1, vec2;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
+    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+    dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
+    dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_raw += 48;
+    dst_argb += 64;
+  }
+}
+
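+// Expands the 5-bit channels to 8 bits, then computes the same luma as
+// ARGBToYRow_MSA: y = (25 * b + 129 * g + 66 * r + 0x1080) >> 8.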
+void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) {
+  int x;
+  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v16u8 dst0;
+  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0);
+    src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    vec2 = src0 & const_0x1F;
+    vec3 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+    reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
+    reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
+    reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
+    reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
+    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+    reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
+    reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
+    reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
+    reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
+    reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
+    reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
+    reg0 *= const_0x19;
+    reg1 *= const_0x19;
+    reg2 *= const_0x81;
+    reg3 *= const_0x81;
+    reg4 *= const_0x42;
+    reg5 *= const_0x42;
+    reg0 += reg2;
+    reg1 += reg3;
+    reg0 += reg4;
+    reg1 += reg5;
+    reg0 += const_0x1080;
+    reg1 += const_0x1080;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    ST_UB(dst0, dst_y);
+    src_argb1555 += 32;
+    dst_y += 16;
+  }
+}
+
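+// Expands to 8-bit B/G/R, then y = (25 * b + 129 * g + 66 * r + 0x1080) >> 8
+// via the dot-product constants below.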
+void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) {
+  int x;
+  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v4u32 res0, res1, res2, res3;
+  v16u8 dst0;
+  v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
+  v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
+  v8i16 const_0x1080 = __msa_fill_h(0x1080);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0);
+    src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src0 & const_0x7E0;
+    vec2 = src0 & const_0xF800;
+    vec3 = src1 & const_0x1F;
+    vec4 = src1 & const_0x7E0;
+    vec5 = src1 & const_0xF800;
+    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+    vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
+    vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
+    vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
+    vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
+    vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
+    vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
+    vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
+    res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
+    res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
+    res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
+    res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
+    res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
+    res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
+    res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
+    res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
+    res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
+    res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
+    res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
+    res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_y);
+    src_rgb565 += 32;
+    dst_y += 16;
+  }
+}
+
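+// Gathers the packed B,G,R triples with the four masks, then
+//   y = (25 * b + 129 * g + 66 * r + 0x1080) >> 8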
+void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
+  v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+                 18, 19, 20, 21, 21, 22, 23, 24};
+  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
+    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
+    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
+    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
+    vec0 += const_0x1080;
+    vec1 += const_0x1080;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 48;
+    dst_y += 16;
+  }
+}
+
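+// Same luma as RGB24ToYRow_MSA, with the constants arranged for the R,G,B
+// byte order of RAW.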
+void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
+  v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+                 18, 19, 20, 21, 21, 22, 23, 24};
+  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
+    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
+    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
+    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
+    vec0 += const_0x1080;
+    vec1 += const_0x1080;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 48;
+    dst_y += 16;
+  }
+}
+
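+// 2x2 subsampled chroma from two ARGB1555 rows (5-bit channels widened to 8):
+// U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8,
+// V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8.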
+void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
+                         int src_stride_argb1555,
+                         uint8* dst_u,
+                         uint8* dst_v,
+                         int width) {
+  int x;
+  const uint16* s = (const uint16*)src_argb1555;
+  const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555);
+  int64_t res0, res1;
+  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
+    src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
+    src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
+    src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    vec0 += src2 & const_0x1F;
+    vec1 += src3 & const_0x1F;
+    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+    vec2 = src0 & const_0x1F;
+    vec3 = src1 & const_0x1F;
+    vec2 += src2 & const_0x1F;
+    vec3 += src3 & const_0x1F;
+    vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    vec4 += src2 & const_0x1F;
+    vec5 += src3 & const_0x1F;
+    vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+    vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+    vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+    vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+    vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
+    vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
+    reg0 = vec6 * const_0x70;
+    reg1 = vec0 * const_0x4A;
+    reg2 = vec2 * const_0x70;
+    reg3 = vec0 * const_0x5E;
+    reg0 += const_0x8080;
+    reg1 += vec2 * const_0x26;
+    reg2 += const_0x8080;
+    reg3 += vec6 * const_0x12;
+    reg0 -= reg1;
+    reg2 -= reg3;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    s += 16;
+    t += 16;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
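+// Same chroma math as ARGB1555ToUVRow_MSA, unpacking 5-bit B/R and 6-bit G.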
+void RGB565ToUVRow_MSA(const uint8* src_rgb565,
+                       int src_stride_rgb565,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
+  int x;
+  const uint16* s = (const uint16*)src_rgb565;
+  const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565);
+  int64_t res0, res1;
+  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
+    src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
+    src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
+    src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    vec0 += src2 & const_0x1F;
+    vec1 += src3 & const_0x1F;
+    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+    vec2 = src0 & const_0x3F;
+    vec3 = src1 & const_0x3F;
+    vec2 += src2 & const_0x3F;
+    vec3 += src3 & const_0x3F;
+    vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    vec4 += src2 & const_0x1F;
+    vec5 += src3 & const_0x1F;
+    vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+    vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+    vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+    vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+    reg0 = vec3 * const_0x70;
+    reg1 = vec1 * const_0x4A;
+    reg2 = vec4 * const_0x70;
+    reg3 = vec1 * const_0x5E;
+    reg0 += const_32896;
+    reg1 += vec4 * const_0x26;
+    reg2 += const_32896;
+    reg3 += vec3 * const_0x12;
+    reg0 -= reg1;
+    reg2 -= reg3;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    s += 16;
+    t += 16;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
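+// Averages each 2x2 block of RGB24 pixels, then applies the BT.601 chroma
+// weights (112, 74, 38, 94, 18) with the 0x8080 bias.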
+void RGB24ToUVRow_MSA(const uint8* src_rgb0,
+                      int src_stride_rgb,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
+  int x;
+  const uint8* s = src_rgb0;
+  const uint8* t = src_rgb0 + src_stride_rgb;
+  int64 res0, res1;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 reg0, reg1, reg2, reg3;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+    reg0 = __msa_srai_h((v8i16)reg0, 2);
+    reg1 = __msa_srai_h((v8i16)reg1, 2);
+    reg2 = __msa_srai_h((v8i16)reg2, 2);
+    reg3 = __msa_srai_h((v8i16)reg3, 2);
+    vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
+    vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
+    vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
+    vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+    vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+    vec3 = vec0 * const_0x70;
+    vec4 = vec1 * const_0x4A;
+    vec5 = vec2 * const_0x26;
+    vec2 *= const_0x70;
+    vec1 *= const_0x5E;
+    vec0 *= const_0x12;
+    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+    reg0 += reg1;
+    reg2 += reg3;
+    reg0 = __msa_srai_h(reg0, 8);
+    reg2 = __msa_srai_h(reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    t += 48;
+    s += 48;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
+void RAWToUVRow_MSA(const uint8* src_rgb0,
+                    int src_stride_rgb,
+                    uint8* dst_u,
+                    uint8* dst_v,
+                    int width) {
+  int x;
+  const uint8* s = src_rgb0;
+  const uint8* t = src_rgb0 + src_stride_rgb;
+  int64 res0, res1;
+  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 reg0, reg1, reg2, reg3;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+    reg0 = __msa_srai_h(reg0, 2);
+    reg1 = __msa_srai_h(reg1, 2);
+    reg2 = __msa_srai_h(reg2, 2);
+    reg3 = __msa_srai_h(reg3, 2);
+    vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+    vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+    vec3 = vec0 * const_0x70;
+    vec4 = vec1 * const_0x4A;
+    vec5 = vec2 * const_0x26;
+    vec2 *= const_0x70;
+    vec1 *= const_0x5E;
+    vec0 *= const_0x12;
+    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+    reg0 += reg1;
+    reg2 += reg3;
+    reg0 = __msa_srai_h(reg0, 8);
+    reg2 = __msa_srai_h(reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    t += 48;
+    s += 48;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
+void NV12ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_uv += 8;
+    rgb_buf += 32;
+  }
+}
+
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, dst0;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    vec0 = vec0 >> 3;
+    vec1 = (vec1 >> 2) << 5;
+    vec2 = (vec2 >> 3) << 11;
+    dst0 = (v16u8)(vec0 | vec1 | vec2);
+    ST_UB(dst0, rgb_buf);
+    src_y += 8;
+    src_uv += 8;
+    rgb_buf += 16;
+  }
+}
+
+void NV21ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_vu,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v16u8 zero = {0};
+  v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_vu);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_vu += 8;
+    rgb_buf += 32;
+  }
+}
+
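+// Saturating add of the Sobel X and Y planes, replicated to B, G and R with
+// ALPHA_VAL as alpha.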
+void SobelRow_MSA(const uint8* src_sobelx,
+                  const uint8* src_sobely,
+                  uint8* dst_argb,
+                  int width) {
+  int x;
+  v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
+  v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
+  v16i8 const_0x4 = __msa_ldi_b(0x4);
+  v16i8 mask1 = mask0 + const_0x4;
+  v16i8 mask2 = mask1 + const_0x4;
+  v16i8 mask3 = mask2 + const_0x4;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
+    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
+    dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+                         const uint8* src_sobely,
+                         uint8* dst_y,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_sobelx += 32;
+    src_sobely += 32;
+    dst_y += 32;
+  }
+}
+
+void SobelXYRow_MSA(const uint8* src_sobelx,
+                    const uint8* src_sobely,
+                    uint8* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, vec2;
+  v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
+    vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
+    reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
+    reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
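+// Full range (JPEG) luma: Y = (38 * R + 75 * G + 15 * B + 64) >> 7.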
+void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+  v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
+  v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
+  v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
+  v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
+  v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
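+// Full range (JPEG) chroma from 2x2 averaged ARGB:
+// U = (127 * B - 84 * G - 43 * R + 0x8080) >> 8,
+// V = (127 * R - 107 * G - 20 * B + 0x8080) >> 8.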
+void ARGBToUVJRow_MSA(const uint8* src_rgb0,
+                      int src_stride_rgb,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
+  int x;
+  const uint8* s = src_rgb0;
+  const uint8* t = src_rgb0 + src_stride_rgb;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, vec2, vec3;
+  v16u8 dst0, dst1;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+  v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
+  v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
+  v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+    src0 = __msa_aver_u_b(src0, src4);
+    src1 = __msa_aver_u_b(src1, src5);
+    src2 = __msa_aver_u_b(src2, src6);
+    src3 = __msa_aver_u_b(src3, src7);
+    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+    vec0 = __msa_aver_u_b(src4, src6);
+    vec1 = __msa_aver_u_b(src5, src7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 64);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 80);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 96);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 112);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 64);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 80);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 96);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 112);
+    src0 = __msa_aver_u_b(src0, src4);
+    src1 = __msa_aver_u_b(src1, src5);
+    src2 = __msa_aver_u_b(src2, src6);
+    src3 = __msa_aver_u_b(src3, src7);
+    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+    vec2 = __msa_aver_u_b(src4, src6);
+    vec3 = __msa_aver_u_b(src5, src7);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
+             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_v);
+    ST_UB(dst1, dst_u);
+    s += 128;
+    t += 128;
+    dst_v += 16;
+    dst_u += 16;
+  }
+}
+
+void BGRAToUVRow_MSA(const uint8* src_rgb0,
+                     int src_stride_rgb,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
+  int x;
+  const uint8* s = src_rgb0;
+  const uint8* t = src_rgb0 + src_stride_rgb;
+  v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
+             const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_v);
+    ST_UB(dst1, dst_u);
+    s += 128;
+    t += 128;
+    dst_v += 16;
+    dst_u += 16;
+  }
+}
+
+void ABGRToUVRow_MSA(const uint8* src_rgb0,
+                     int src_stride_rgb,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
+  int x;
+  const uint8* s = src_rgb0;
+  const uint8* t = src_rgb0 + src_stride_rgb;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+  v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
+  v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    READ_ARGB(s, t, src0, src1, src2, src3);
+    ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
+             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    s += 128;
+    t += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void RGBAToUVRow_MSA(const uint8* src_rgb0,
+                     int src_stride_rgb,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
+  int x;
+  const uint8* s = src_rgb0;
+  const uint8* t = src_rgb0 + src_stride_rgb;
+  v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
+  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
+             const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    s += 128;
+    t += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void I444ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0, dst1;
+  v8u16 vec0, vec1, vec2;
+  v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v8i16 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+
+  for (x = 0; x < width; x += 8) {
+    READI444(src_y, src_u, src_v, src0, src1, src2);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    reg0 *= vec_yg;
+    reg1 *= vec_yg;
+    reg0 = __msa_srai_w(reg0, 16);
+    reg1 = __msa_srai_w(reg1, 16);
+    reg4 = reg0 + vec_br;
+    reg5 = reg1 + vec_br;
+    reg2 = reg0 + vec_bg;
+    reg3 = reg1 + vec_bg;
+    reg0 += vec_bb;
+    reg1 += vec_bb;
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
+    reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+    reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+    reg0 -= reg6 * vec_ub;
+    reg1 -= reg7 * vec_ub;
+    reg2 -= reg6 * vec_ug;
+    reg3 -= reg7 * vec_ug;
+    reg4 -= reg8 * vec_vr;
+    reg5 -= reg9 * vec_vr;
+    reg2 -= reg8 * vec_vg;
+    reg3 -= reg9 * vec_vg;
+    reg0 = __msa_srai_w(reg0, 6);
+    reg1 = __msa_srai_w(reg1, 6);
+    reg2 = __msa_srai_w(reg2, 6);
+    reg3 = __msa_srai_w(reg3, 6);
+    reg4 = __msa_srai_w(reg4, 6);
+    reg5 = __msa_srai_w(reg5, 6);
+    CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
+    dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
+    dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_u += 8;
+    src_v += 8;
+    rgb_buf += 32;
+  }
+}
+
+void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) {
+  int x;
+  v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
+  v8i16 vec0, vec1;
+  v4i32 reg0, reg1, reg2, reg3;
+  v4i32 vec_yg = __msa_fill_w(0x4A35);
+  v8i16 vec_ygb = __msa_fill_h(0xFB78);
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v8i16 max = __msa_ldi_h(0xFF);
+  v8i16 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
+    vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
+    reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
+    reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
+    reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
+    reg0 *= vec_yg;
+    reg1 *= vec_yg;
+    reg2 *= vec_yg;
+    reg3 *= vec_yg;
+    reg0 = __msa_srai_w(reg0, 16);
+    reg1 = __msa_srai_w(reg1, 16);
+    reg2 = __msa_srai_w(reg2, 16);
+    reg3 = __msa_srai_w(reg3, 16);
+    vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec0 += vec_ygb;
+    vec1 += vec_ygb;
+    vec0 = __msa_srai_h(vec0, 6);
+    vec1 = __msa_srai_h(vec1, 6);
+    vec0 = __msa_maxi_s_h(vec0, 0);
+    vec1 = __msa_maxi_s_h(vec1, 0);
+    vec0 = __msa_min_s_h(max, vec0);
+    vec1 = __msa_min_s_h(max, vec1);
+    res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
+    res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
+    res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
+    res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
+    ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16);
+    src_y += 16;
+    rgb_buf += 64;
+  }
+}
+
+void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) {
+  int x;
+  v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
+    vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
+    vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_y += 16;
+    dst_argb += 64;
+  }
+}
+
+void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+    src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+    src_yuy2 += 16;
+    rgb_buf += 32;
+  }
+}
+
+void UYVYToARGBRow_MSA(const uint8* src_uyvy,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0);
+    src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+    src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+    src_uyvy += 16;
+    rgb_buf += 32;
+  }
+}
+
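+// dst = (src * (256 - fraction) + src_next * fraction + 128) >> 8, with fast
+// paths for fraction 0 (copy) and fraction 128 (average of the two rows).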
+void InterpolateRow_MSA(uint8* dst_ptr,
+                        const uint8* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int32 source_y_fraction) {
+  int32 y1_fraction = source_y_fraction;
+  int32 y0_fraction = 256 - y1_fraction;
+  uint16 y_fractions;
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, y_frac;
+
+  if (0 == y1_fraction) {
+    memcpy(dst_ptr, src_ptr, width);
+    return;
+  }
+
+  if (128 == y1_fraction) {
+    for (x = 0; x < width; x += 32) {
+      src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+      src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+      src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+      src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+      dst0 = __msa_aver_u_b(src0, src2);
+      dst1 = __msa_aver_u_b(src1, src3);
+      ST_UB2(dst0, dst1, dst_ptr, 16);
+      s += 32;
+      t += 32;
+      dst_ptr += 32;
+    }
+    return;
+  }
+
+  y_fractions = (uint16)(y0_fraction + (y1_fraction << 8));
+  y_frac = (v8u16)__msa_fill_h(y_fractions);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
+    vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
+    vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
+    vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
+    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
+    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
+    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    ST_UB2(dst0, dst1, dst_ptr, 16);
+    s += 32;
+    t += 32;
+    dst_ptr += 32;
+  }
+}
+
+void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
+  int x;
+  v16u8 dst0 = (v16u8)__msa_fill_w(v32);
+
+  for (x = 0; x < width; x += 4) {
+    ST_UB(dst0, dst_argb);
+    dst_argb += 16;
+  }
+}
+
+void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+  v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
+  v16i8 shuffler1 = {8,  7,  12, 11, 10, 15, 14, 13,
+                     18, 17, 16, 21, 20, 19, 24, 23};
+  v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
+                     24, 23, 28, 27, 26, 31, 30, 29};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
+    src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
+    src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
+    ST_UB2(dst0, dst1, dst_rgb24, 16);
+    ST_UB(dst2, (dst_rgb24 + 32));
+    src_raw += 48;
+    dst_rgb24 += 48;
+  }
+}
+
+void MergeUVRow_MSA(const uint8* src_u,
+                    const uint8* src_v,
+                    uint8* dst_uv,
+                    int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
+    ST_UB2(dst0, dst1, dst_uv, 16);
+    src_u += 16;
+    src_v += 16;
+    dst_uv += 32;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
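
For reference, a minimal scalar sketch of the fixed-point BT.601 conversions the
MSA rows above vectorize. The coefficients (66/129/25 for Y, 112/74/38 and
112/94/18 for U/V) and the 0x1080/0x8080 biases are the ones that appear in the
vector code; the helper names below are illustrative only and are not part of
this change.

#include <stdint.h>

// Scalar equivalents of the vectorized conversions (illustrative only).
static uint8_t RGBToY_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);  // 16..235
}
static uint8_t RGBToU_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);  // centered at 128
}
static uint8_t RGBToV_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}

Keeping the weights and biases within 16-bit integer range is what allows the
dotp_u_h/dpadd_u_h instructions above to produce eight results per operation.
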
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
index 909df06..bed14e0 100644
--- a/files/source/row_neon.cc
+++ b/files/source/row_neon.cc
@@ -10,6 +10,8 @@
 
 #include "libyuv/row.h"
 
+#include <stdio.h>
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -20,29 +22,18 @@
     !defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
+#define READYUV422 \
+  MEMACCESS(0)     \
+  "vld1.8     {d0}, [%0]!                    \n"                             \
     MEMACCESS(1)                                                               \
     "vld1.32    {d2[0]}, [%1]!                 \n"                             \
     MEMACCESS(2)                                                               \
     "vld1.32    {d2[1]}, [%2]!                 \n"
 
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vzip.u8    d2, d3                         \n"
-
 // Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
+#define READYUV444 \
+  MEMACCESS(0)     \
+  "vld1.8     {d0}, [%0]!                    \n"                             \
     MEMACCESS(1)                                                               \
     "vld1.8     {d2}, [%1]!                    \n"                             \
     MEMACCESS(2)                                                               \
@@ -51,15 +42,15 @@
     "vrshrn.u16 d2, q1, #1                     \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    "vmov.u8    d2, #128                       \n"
+#define READYUV400                               \
+  MEMACCESS(0)                                   \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vmov.u8    d2, #128                       \n"
 
 // Read 8 Y and 4 UV from NV12
-#define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
+#define READNV12 \
+  MEMACCESS(0)   \
+  "vld1.8     {d0}, [%0]!                    \n"                             \
     MEMACCESS(1)                                                               \
     "vld1.8     {d2}, [%1]!                    \n"                             \
     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
@@ -67,9 +58,9 @@
     "vtrn.u32   d2, d3                         \n"
 
 // Read 8 Y and 4 VU from NV21
-#define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
+#define READNV21 \
+  MEMACCESS(0)   \
+  "vld1.8     {d0}, [%0]!                    \n"                             \
     MEMACCESS(1)                                                               \
     "vld1.8     {d2}, [%1]!                    \n"                             \
     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
@@ -77,25 +68,25 @@
     "vtrn.u32   d2, d3                         \n"
 
 // Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d0, d2}, [%0]!                \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+#define READYUY2                                 \
+  MEMACCESS(0)                                   \
+  "vld2.8     {d0, d2}, [%0]!                \n" \
+  "vmov.u8    d3, d2                         \n" \
+  "vuzp.u8    d2, d3                         \n" \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d2, d3}, [%0]!                \n"                             \
-    "vmov.u8    d0, d3                         \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+#define READUYVY                                 \
+  MEMACCESS(0)                                   \
+  "vld2.8     {d2, d3}, [%0]!                \n" \
+  "vmov.u8    d0, d3                         \n" \
+  "vmov.u8    d3, d2                         \n" \
+  "vuzp.u8    d2, d3                         \n" \
+  "vtrn.u32   d2, d3                         \n"
 
-#define YUVTORGB_SETUP                                                         \
-    MEMACCESS([kUVToRB])                                                       \
-    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
+#define YUVTORGB_SETUP \
+  MEMACCESS([kUVToRB]) \
+  "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
     MEMACCESS([kUVToG])                                                        \
     "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
     MEMACCESS([kUVBiasBGR])                                                    \
@@ -107,32 +98,32 @@
     MEMACCESS([kYToRgb])                                                       \
     "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
 
-#define YUVTORGB                                                               \
-    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
-    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
-    "vmovl.u8   q0, d0                         \n" /* Y                      */\
-    "vmovl.s16  q10, d1                        \n"                             \
-    "vmovl.s16  q0, d0                         \n"                             \
-    "vmul.s32   q10, q10, q15                  \n"                             \
-    "vmul.s32   q0, q0, q15                    \n"                             \
-    "vqshrun.s32 d0, q0, #16                   \n"                             \
-    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
-    "vadd.s16   d18, d19                       \n"                             \
-    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
-    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
-    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
-    "vaddw.u16  q1, q1, d16                    \n"                             \
-    "vaddw.u16  q10, q10, d17                  \n"                             \
-    "vaddw.u16  q3, q3, d18                    \n"                             \
-    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
-    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
-    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
-    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
-    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
-    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
-    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
-    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
-    "vqshrun.s16 d21, q0, #6                   \n" /* G */
+#define YUVTORGB                                                              \
+  "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */ \
+  "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */ \
+  "vmovl.u8   q0, d0                         \n" /* Y                      */ \
+  "vmovl.s16  q10, d1                        \n"                              \
+  "vmovl.s16  q0, d0                         \n"                              \
+  "vmul.s32   q10, q10, q15                  \n"                              \
+  "vmul.s32   q0, q0, q15                    \n"                              \
+  "vqshrun.s32 d0, q0, #16                   \n"                              \
+  "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */ \
+  "vadd.s16   d18, d19                       \n"                              \
+  "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */ \
+  "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */ \
+  "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/ \
+  "vaddw.u16  q1, q1, d16                    \n"                              \
+  "vaddw.u16  q10, q10, d17                  \n"                              \
+  "vaddw.u16  q3, q3, d18                    \n"                              \
+  "vqadd.s16  q8, q0, q13                    \n" /* B */                      \
+  "vqadd.s16  q9, q0, q14                    \n" /* R */                      \
+  "vqadd.s16  q0, q0, q4                     \n" /* G */                      \
+  "vqadd.s16  q8, q8, q1                     \n" /* B */                      \
+  "vqadd.s16  q9, q9, q10                    \n" /* R */                      \
+  "vqsub.s16  q0, q0, q3                     \n" /* G */                      \
+  "vqshrun.s16 d20, q8, #6                   \n" /* B */                      \
+  "vqshrun.s16 d22, q9, #6                   \n" /* R */                      \
+  "vqshrun.s16 d21, q0, #6                   \n" /* G */
 
 void I444ToARGBRow_NEON(const uint8* src_y,
                         const uint8* src_u,
@@ -227,36 +218,6 @@
   );
 }
 
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
 void I422ToRGBARow_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -316,12 +277,12 @@
   );
 }
 
-#define ARGBTORGB565                                                           \
-    "vshll.u8    q0, d22, #8                   \n"  /* R                    */ \
-    "vshll.u8    q8, d21, #8                   \n"  /* G                    */ \
-    "vshll.u8    q9, d20, #8                   \n"  /* B                    */ \
-    "vsri.16     q0, q8, #5                    \n"  /* RG                   */ \
-    "vsri.16     q0, q9, #11                   \n"  /* RGB                  */
+#define ARGBTORGB565                                                        \
+  "vshll.u8    q0, d22, #8                   \n" /* R                    */ \
+  "vshll.u8    q8, d21, #8                   \n" /* G                    */ \
+  "vshll.u8    q9, d20, #8                   \n" /* B                    */ \
+  "vsri.16     q0, q8, #5                    \n" /* RG                   */ \
+  "vsri.16     q0, q9, #11                   \n" /* RGB                  */
 
 void I422ToRGB565Row_NEON(const uint8* src_y,
                           const uint8* src_u,
@@ -353,14 +314,14 @@
   );
 }
 
-#define ARGBTOARGB1555                                                         \
-    "vshll.u8    q0, d23, #8                   \n"  /* A                    */ \
-    "vshll.u8    q8, d22, #8                   \n"  /* R                    */ \
-    "vshll.u8    q9, d21, #8                   \n"  /* G                    */ \
-    "vshll.u8    q10, d20, #8                  \n"  /* B                    */ \
-    "vsri.16     q0, q8, #1                    \n"  /* AR                   */ \
-    "vsri.16     q0, q9, #6                    \n"  /* ARG                  */ \
-    "vsri.16     q0, q10, #11                  \n"  /* ARGB                 */
+#define ARGBTOARGB1555                                                      \
+  "vshll.u8    q0, d23, #8                   \n" /* A                    */ \
+  "vshll.u8    q8, d22, #8                   \n" /* R                    */ \
+  "vshll.u8    q9, d21, #8                   \n" /* G                    */ \
+  "vshll.u8    q10, d20, #8                  \n" /* B                    */ \
+  "vsri.16     q0, q8, #1                    \n" /* AR                   */ \
+  "vsri.16     q0, q9, #6                    \n" /* ARG                  */ \
+  "vsri.16     q0, q10, #11                  \n" /* ARGB                 */
 
 void I422ToARGB1555Row_NEON(const uint8* src_y,
                             const uint8* src_u,
@@ -393,14 +354,14 @@
   );
 }
 
-#define ARGBTOARGB4444                                                         \
-    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
-    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
-    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
-    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
-    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
-    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
-    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
+#define ARGBTOARGB4444                                                      \
+  "vshr.u8    d20, d20, #4                   \n" /* B                    */ \
+  "vbic.32    d21, d21, d4                   \n" /* G                    */ \
+  "vshr.u8    d22, d22, #4                   \n" /* R                    */ \
+  "vbic.32    d23, d23, d4                   \n" /* A                    */ \
+  "vorr       d0, d20, d21                   \n" /* BG                   */ \
+  "vorr       d1, d22, d23                   \n" /* RA                   */ \
+  "vzip.u8    d0, d1                         \n" /* BGRA                 */
 
 void I422ToARGB4444Row_NEON(const uint8* src_y,
                             const uint8* src_u,
@@ -434,9 +395,7 @@
   );
 }
 
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
   asm volatile (
     YUVTORGB_SETUP
     "vmov.u8    d23, #255                      \n"
@@ -459,9 +418,7 @@
   );
 }
 
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
   asm volatile (
     "vmov.u8    d23, #255                      \n"
   "1:                                          \n"
@@ -618,7 +575,9 @@
 }
 
 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8* src_uv,
+                     uint8* dst_u,
+                     uint8* dst_v,
                      int width) {
   asm volatile (
   "1:                                          \n"
@@ -640,7 +599,9 @@
 }
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uv,
                      int width) {
   asm volatile (
   "1:                                          \n"
@@ -737,7 +698,9 @@
   );
 }
 
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_NEON(const uint8* src_uv,
+                      uint8* dst_u,
+                      uint8* dst_v,
                       int width) {
   asm volatile (
     // Start at end of source row.
@@ -844,17 +807,17 @@
   );
 }
 
-#define RGB565TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
-    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
-    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+#define RGB565TOARGB                                                        \
+  "vshrn.u16  d6, q0, #5                     \n" /* G xxGGGGGG           */ \
+  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB RRRRRxxx */ \
+  "vshl.u8    d6, d6, #2                     \n" /* G GGGGGG00 upper 6   */ \
+  "vshr.u8    d1, d1, #3                     \n" /* R 000RRRRR lower 5   */ \
+  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
+  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
+  "vshr.u8    d4, d6, #6                     \n" /* G 000000GG lower 2   */ \
+  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
+  "vorr.u8    d1, d4, d6                     \n" /* G                    */
 
 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
   asm volatile (
@@ -875,34 +838,35 @@
   );
 }
 
-#define ARGB1555TOARGB                                                         \
-    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
-    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
-    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
-    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
-    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
-    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
-    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
-    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
+#define ARGB1555TOARGB                                                      \
+  "vshrn.u16  d7, q0, #8                     \n" /* A Arrrrrxx           */ \
+  "vshr.u8    d6, d7, #2                     \n" /* R xxxRRRRR           */ \
+  "vshrn.u16  d5, q0, #5                     \n" /* G xxxGGGGG           */ \
+  "vmovn.u16  d4, q0                         \n" /* B xxxBBBBB           */ \
+  "vshr.u8    d7, d7, #7                     \n" /* A 0000000A           */ \
+  "vneg.s8    d7, d7                         \n" /* A AAAAAAAA upper 8   */ \
+  "vshl.u8    d6, d6, #3                     \n" /* R RRRRR000 upper 5   */ \
+  "vshr.u8    q1, q3, #5                     \n" /* R,A 00000RRR lower 3 */ \
+  "vshl.u8    q0, q2, #3                     \n" /* B,G BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,G 00000BBB lower 3 */ \
+  "vorr.u8    q1, q1, q3                     \n" /* R,A                  */ \
+  "vorr.u8    q0, q0, q2                     \n" /* B,G                  */
 
 // RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
-    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+#define RGB555TOARGB                                                        \
+  "vshrn.u16  d6, q0, #5                     \n" /* G xxxGGGGG           */ \
+  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB xRRRRRxx */ \
+  "vshl.u8    d6, d6, #3                     \n" /* G GGGGG000 upper 5   */ \
+  "vshr.u8    d1, d1, #2                     \n" /* R 00xRRRRR lower 5   */ \
+  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
+  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
+  "vshr.u8    d4, d6, #5                     \n" /* G 00000GGG lower 3   */ \
+  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
+  "vorr.u8    d1, d4, d6                     \n" /* G                    */
 
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
+                            uint8* dst_argb,
                             int width) {
   asm volatile (
     "vmov.u8    d3, #255                       \n"  // Alpha
@@ -922,17 +886,18 @@
   );
 }
 
-#define ARGB4444TOARGB                                                         \
-    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
-    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
-    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
-    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
-    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
-    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
-    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
+#define ARGB4444TOARGB                                                      \
+  "vuzp.u8    d0, d1                         \n" /* d0 BG, d1 RA         */ \
+  "vshl.u8    q2, q0, #4                     \n" /* B,R BBBB0000         */ \
+  "vshr.u8    q1, q0, #4                     \n" /* G,A 0000GGGG         */ \
+  "vshr.u8    q0, q2, #4                     \n" /* B,R 0000BBBB         */ \
+  "vorr.u8    q0, q0, q2                     \n" /* B,R BBBBBBBB         */ \
+  "vshl.u8    q2, q1, #4                     \n" /* G,A GGGG0000         */ \
+  "vorr.u8    q1, q1, q2                     \n" /* G,A GGGGGGGG         */ \
+  "vswp.u8    d1, d2                         \n" /* B,R,G,A -> B,G,R,A   */
 
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
+                            uint8* dst_argb,
                             int width) {
   asm volatile (
     "vmov.u8    d3, #255                       \n"  // Alpha
@@ -1021,7 +986,9 @@
   );
 }
 
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+                         uint8* dst_u,
+                         uint8* dst_v,
                          int width) {
   asm volatile (
   "1:                                          \n"
@@ -1042,7 +1009,9 @@
   );
 }
 
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+                         uint8* dst_u,
+                         uint8* dst_v,
                          int width) {
   asm volatile (
   "1:                                          \n"
@@ -1063,8 +1032,11 @@
   );
 }
 
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+                      int stride_yuy2,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // stride + src_yuy2
   "1:                                          \n"
@@ -1090,8 +1062,11 @@
   );
 }
 
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_NEON(const uint8* src_uyvy,
+                      int stride_uyvy,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // stride + src_uyvy
   "1:                                          \n"
@@ -1118,8 +1093,10 @@
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
+void ARGBShuffleRow_NEON(const uint8* src_argb,
+                         uint8* dst_argb,
+                         const uint8* shuffler,
+                         int width) {
   asm volatile (
     MEMACCESS(3)
     "vld1.8     {q2}, [%3]                     \n"  // shuffler
@@ -1143,7 +1120,8 @@
 void I422ToYUY2Row_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
+                        uint8* dst_yuy2,
+                        int width) {
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -1169,7 +1147,8 @@
 void I422ToUYVYRow_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
+                        uint8* dst_uyvy,
+                        int width) {
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -1210,8 +1189,10 @@
   );
 }
 
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
+                                uint8* dst_rgb,
+                                const uint32 dither4,
+                                int width) {
   asm volatile (
     "vdup.32    d2, %2                         \n"  // dither4
   "1:                                          \n"
@@ -1233,7 +1214,8 @@
   );
 }
 
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8* src_argb,
+                            uint8* dst_argb1555,
                             int width) {
   asm volatile (
   "1:                                          \n"
@@ -1252,7 +1234,8 @@
   );
 }
 
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8* src_argb,
+                            uint8* dst_argb4444,
                             int width) {
   asm volatile (
     "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
@@ -1341,7 +1324,9 @@
 }
 
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8* src_argb,
+                         uint8* dst_u,
+                         uint8* dst_v,
                          int width) {
   asm volatile (
     "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
@@ -1381,85 +1366,31 @@
   );
 }
 
-// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
-    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
-    "vpadd.u16  d1, d8, d9                     \n"  // B
-    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
-    "vpadd.u16  d3, d10, d11                   \n"  // G
-    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
-    "vpadd.u16  d5, d12, d13                   \n"  // R
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
-    "vmul.s16   q8, q0, q10                    \n"  // B
-    "vmls.s16   q8, q1, q11                    \n"  // G
-    "vmls.s16   q8, q2, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q2, q10                    \n"  // R
-    "vmls.s16   q9, q1, q14                    \n"  // G
-    "vmls.s16   q9, q0, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
-    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
-    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
-    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
-    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
-    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
-    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
-    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
-    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
-    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
-    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+#define RGBTOUV(QB, QG, QR)                                                 \
+  "vmul.s16   q8, " #QB                                                     \
+  ", q10               \n" /* B                    */                       \
+  "vmls.s16   q8, " #QG                                                     \
+  ", q11               \n" /* G                    */                       \
+  "vmls.s16   q8, " #QR                                                     \
+  ", q12               \n"                       /* R                    */ \
+  "vadd.u16   q8, q8, q15                    \n" /* +128 -> unsigned     */ \
+  "vmul.s16   q9, " #QR                                                     \
+  ", q10               \n" /* R                    */                       \
+  "vmls.s16   q9, " #QG                                                     \
+  ", q14               \n" /* G                    */                       \
+  "vmls.s16   q9, " #QB                                                     \
+  ", q13               \n"                       /* B                    */ \
+  "vadd.u16   q9, q9, q15                    \n" /* +128 -> unsigned     */ \
+  "vqshrn.u16  d0, q8, #8                    \n" /* 16 bit to 8 bit U    */ \
+  "vqshrn.u16  d1, q9, #8                    \n" /* 16 bit to 8 bit V    */
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVRow_NEON(const uint8* src_argb,
+                      int src_stride_argb,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1507,8 +1438,11 @@
 }
 
 // TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_NEON(const uint8* src_argb,
+                       int src_stride_argb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
     "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
@@ -1555,8 +1489,11 @@
   );
 }
 
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_NEON(const uint8* src_bgra,
+                      int src_stride_bgra,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_bgra
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1603,8 +1540,11 @@
   );
 }
 
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ABGRToUVRow_NEON(const uint8* src_abgr,
+                      int src_stride_abgr,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_abgr
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1651,8 +1591,11 @@
   );
 }
 
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_NEON(const uint8* src_rgba,
+                      int src_stride_rgba,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_rgba
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1699,8 +1642,11 @@
   );
 }
 
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1747,8 +1693,11 @@
   );
 }
 
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width) {
+void RAWToUVRow_NEON(const uint8* src_raw,
+                     int src_stride_raw,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_raw
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1796,8 +1745,11 @@
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width) {
+void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1865,8 +1817,11 @@
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int width) {
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8* dst_u,
+                          uint8* dst_v,
+                          int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1934,8 +1889,11 @@
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width) {
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8* dst_u,
+                          uint8* dst_v,
+                          int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -2215,8 +2173,10 @@
 
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
+                         const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   asm volatile (
     "cmp        %4, #0                         \n"
@@ -2280,8 +2240,10 @@
 }
 
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBBlendRow_NEON(const uint8* src_argb0,
+                       const uint8* src_argb1,
+                       uint8* dst_argb,
+                       int width) {
   asm volatile (
     "subs       %3, #8                         \n"
     "blt        89f                            \n"
@@ -2371,8 +2333,11 @@
 
 // Quantize 8 ARGB pixels (32 bytes).
 // dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
+void ARGBQuantizeRow_NEON(uint8* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
   asm volatile (
     "vdup.u16   q8, %2                         \n"
     "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
@@ -2414,7 +2379,9 @@
 // Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_NEON(const uint8* src_argb,
+                       uint8* dst_argb,
+                       int width,
                        uint32 value) {
   asm volatile (
     "vdup.u32   q0, %3                         \n"  // duplicate scale value.
@@ -2523,8 +2490,10 @@
 // Transform 8 ARGB pixels (32 bytes) with color matrix.
 // TODO(fbarchard): Was the same as Sepia except the matrix is provided.  This function
 // needs to saturate.  Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_NEON(const uint8* src_argb,
+                             uint8* dst_argb,
+                             const int8* matrix_argb,
+                             int width) {
   asm volatile (
     MEMACCESS(3)
     "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
@@ -2584,8 +2553,10 @@
 }
 
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+void ARGBMultiplyRow_NEON(const uint8* src_argb0,
+                          const uint8* src_argb1,
+                          uint8* dst_argb,
+                          int width) {
   asm volatile (
     // 8 pixel loop.
   "1:                                          \n"
@@ -2616,8 +2587,10 @@
 }
 
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+void ARGBAddRow_NEON(const uint8* src_argb0,
+                     const uint8* src_argb1,
+                     uint8* dst_argb,
+                     int width) {
   asm volatile (
     // 8 pixel loop.
   "1:                                          \n"
@@ -2642,8 +2615,10 @@
 }
 
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+void ARGBSubtractRow_NEON(const uint8* src_argb0,
+                          const uint8* src_argb1,
+                          uint8* dst_argb,
+                          int width) {
   asm volatile (
     // 8 pixel loop.
   "1:                                          \n"
@@ -2672,8 +2647,10 @@
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
+void SobelRow_NEON(const uint8* src_sobelx,
+                   const uint8* src_sobely,
+                   uint8* dst_argb,
+                   int width) {
   asm volatile (
     "vmov.u8    d3, #255                       \n"  // alpha
     // 8 pixel loop.
@@ -2699,8 +2676,10 @@
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
+void SobelToPlaneRow_NEON(const uint8* src_sobelx,
+                          const uint8* src_sobely,
+                          uint8* dst_y,
+                          int width) {
   asm volatile (
     // 16 pixel loop.
   "1:                                          \n"
@@ -2727,8 +2706,10 @@
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
+void SobelXYRow_NEON(const uint8* src_sobelx,
+                     const uint8* src_sobely,
+                     uint8* dst_argb,
+                     int width) {
   asm volatile (
     "vmov.u8    d3, #255                       \n"  // alpha
     // 8 pixel loop.
@@ -2755,8 +2736,11 @@
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+void SobelXRow_NEON(const uint8* src_y0,
+                    const uint8* src_y1,
+                    const uint8* src_y2,
+                    uint8* dst_sobelx,
+                    int width) {
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -2798,8 +2782,10 @@
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
+void SobelYRow_NEON(const uint8* src_y0,
+                    const uint8* src_y1,
+                    uint8* dst_sobely,
+                    int width) {
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
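// A hedged scalar sketch of the Sobel rows: SobelXRow applies the 3x3 kernel
// from the comment above across three source rows, SobelYRow is the analogous
// row-difference filter, and SobelRow / SobelToPlaneRow combine the two
// gradients with a saturating add (SobelXYRow stores X, the sum, and Y in
// R, G and B respectively).
#include <stdint.h>
#include <stdlib.h>

static uint8_t SobelXPixelSketch(const uint8_t* y0, const uint8_t* y1,
                                 const uint8_t* y2) {
  int sx = (y0[2] - y0[0]) + 2 * (y1[2] - y1[0]) + (y2[2] - y2[0]);
  sx = abs(sx);
  return (uint8_t)(sx > 255 ? 255 : sx);
}

static uint8_t SobelCombineSketch(uint8_t sobelx, uint8_t sobely) {
  int s = sobelx + sobely;
  return (uint8_t)(s > 255 ? 255 : s);
}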
@@ -2835,7 +2821,63 @@
   : "cc", "memory", "q0", "q1"  // Clobber List
   );
 }
-#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
+  asm volatile (
+    "vdup.32    q0, %3                         \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
+    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+    "vmovl.u16  q2, d2                         \n"  // 8 int's
+    "vmovl.u16  q3, d3                         \n"
+    "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
+    "vcvt.f32.u32  q3, q3                      \n"
+    "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
+    "vmul.f32   q3, q3, q0                     \n"
+    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
+    "vqshrn.u32 d3, q3, #13                    \n"
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "r"(1.9259299444e-34f)    // %3
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// TODO(fbarchard): multiply by element.
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+    "vdup.32    q0, %3                         \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
+    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+    "vmovl.u16  q2, d2                         \n"  // 8 int's
+    "vmovl.u16  q3, d3                         \n"
+    "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
+    "vcvt.f32.u32  q3, q3                      \n"
+    "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
+    "vmul.f32   q3, q3, q0                     \n"
+    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
+    "vqshrn.u32 d3, q3, #13                    \n"
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "r"(scale * 1.9259299444e-34f)    // %3
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
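// A hedged scalar sketch of the half-float trick used by the two rows above:
// 1.9259299444e-34f is 2^-112, which turns the float's bias-127 exponent
// field into the half-float's bias-15 field, so shifting the raw bit pattern
// right by 13 (as vqshrn.u32 #13 does, with saturation) yields the half-float
// bits directly for in-range values.
#include <stdint.h>
#include <string.h>

static uint16_t UInt16ToHalfSketch(uint16_t v, float scale) {
  float f = (float)v * scale * 1.9259299444e-34f;  // rebias exponent by 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));             // reinterpret the float's bits
  uint32_t h = bits >> 13;                     // 23-bit -> 10-bit mantissa
  return (uint16_t)(h > 0xffff ? 0xffff : h);  // saturate like vqshrn.u32
}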
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc
index 6375d4f..ebd685e 100644
--- a/files/source/row_neon64.cc
+++ b/files/source/row_neon64.cc
@@ -19,28 +19,18 @@
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
+#define READYUV422 \
+  MEMACCESS(0)     \
+  "ld1        {v0.8b}, [%0], #8              \n"                             \
     MEMACCESS(1)                                                               \
     "ld1        {v1.s}[0], [%1], #4            \n"                             \
     MEMACCESS(2)                                                               \
     "ld1        {v1.s}[1], [%2], #4            \n"
 
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.h}[0], [%1], #2            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v2.h}[1], [%2], #2            \n"                             \
-    "zip1       v1.8b, v2.8b, v2.8b            \n"
-
 // Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
+#define READYUV444 \
+  MEMACCESS(0)     \
+  "ld1        {v0.8b}, [%0], #8              \n"                             \
     MEMACCESS(1)                                                               \
     "ld1        {v1.d}[0], [%1], #8            \n"                             \
     MEMACCESS(2)                                                               \
@@ -49,15 +39,15 @@
     "rshrn      v1.8b, v1.8h, #1               \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    "movi       v1.8b , #128                   \n"
+#define READYUV400                               \
+  MEMACCESS(0)                                   \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "movi       v1.8b , #128                   \n"
 
 // Read 8 Y and 4 UV from NV12
-#define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
+#define READNV12 \
+  MEMACCESS(0)   \
+  "ld1        {v0.8b}, [%0], #8              \n"                             \
     MEMACCESS(1)                                                               \
     "ld1        {v2.8b}, [%1], #8              \n"                             \
     "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
@@ -65,9 +55,9 @@
     "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 Y and 4 VU from NV21
-#define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
+#define READNV21 \
+  MEMACCESS(0)   \
+  "ld1        {v0.8b}, [%0], #8              \n"                             \
     MEMACCESS(1)                                                               \
     "ld1        {v2.8b}, [%1], #8              \n"                             \
     "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
@@ -75,57 +65,65 @@
     "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
-    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
-    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READYUY2                                 \
+  MEMACCESS(0)                                   \
+  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" \
+  "uzp2       v3.8b, v1.8b, v1.8b            \n" \
+  "uzp1       v1.8b, v1.8b, v1.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
-    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
-    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READUYVY                                 \
+  MEMACCESS(0)                                   \
+  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" \
+  "orr        v0.8b, v3.8b, v3.8b            \n" \
+  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
-#define YUVTORGB_SETUP                                                         \
-    "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
-    "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
-    "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"                             \
-    "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
+#define YUVTORGB_SETUP                           \
+  "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n" \
+  "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n" \
+  "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n" \
+  "ld1r       {v31.4s}, [%[kYToRgb]]         \n" \
+  "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+  "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
 
-#define YUVTORGB(vR, vG, vB)                                                   \
-    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
-    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
-    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
-    "ushll      v0.4s, v0.4h, #0               \n"                             \
-    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
-    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
-    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
-    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
-    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
-    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
-    "uxtl       v2.8h, v2.8b                   \n"                             \
-    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
-    "mul        v3.8h, v1.8h, v27.8h           \n"                             \
-    "mul        v5.8h, v1.8h, v29.8h           \n"                             \
-    "mul        v6.8h, v2.8h, v30.8h           \n"                             \
-    "mul        v7.8h, v2.8h, v28.8h           \n"                             \
-    "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
-    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
-    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
-    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
-    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
-    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
-    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
-    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
-    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
-    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
+#define YUVTORGB(vR, vG, vB)                                        \
+  "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */ \
+  "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */ \
+  "ushll2     v3.4s, v0.8h, #0               \n" /* Y */            \
+  "ushll      v0.4s, v0.4h, #0               \n"                    \
+  "mul        v3.4s, v3.4s, v31.4s           \n"                    \
+  "mul        v0.4s, v0.4s, v31.4s           \n"                    \
+  "sqshrun    v0.4h, v0.4s, #16              \n"                    \
+  "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */            \
+  "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */ \
+  "mov        v2.d[0], v1.d[1]               \n" /* Extract V */    \
+  "uxtl       v2.8h, v2.8b                   \n"                    \
+  "uxtl       v1.8h, v1.8b                   \n" /* Extract U */    \
+  "mul        v3.8h, v1.8h, v27.8h           \n"                    \
+  "mul        v5.8h, v1.8h, v29.8h           \n"                    \
+  "mul        v6.8h, v2.8h, v30.8h           \n"                    \
+  "mul        v7.8h, v2.8h, v28.8h           \n"                    \
+  "sqadd      v6.8h, v6.8h, v5.8h            \n"                    \
+  "sqadd      " #vB                                                 \
+  ".8h, v24.8h, v0.8h      \n" /* B */                              \
+  "sqadd      " #vG                                                 \
+  ".8h, v25.8h, v0.8h      \n" /* G */                              \
+  "sqadd      " #vR                                                 \
+  ".8h, v26.8h, v0.8h      \n" /* R */                              \
+  "sqadd      " #vB ".8h, " #vB                                     \
+  ".8h, v3.8h  \n" /* B */                                          \
+  "sqsub      " #vG ".8h, " #vG                                     \
+  ".8h, v6.8h  \n" /* G */                                          \
+  "sqadd      " #vR ".8h, " #vR                                     \
+  ".8h, v7.8h  \n" /* R */                                          \
+  "sqshrun    " #vB ".8b, " #vB                                     \
+  ".8h, #6     \n" /* B */                                          \
+  "sqshrun    " #vG ".8b, " #vG                                     \
+  ".8h, #6     \n"                               /* G */            \
+  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */
 
 void I444ToARGBRow_NEON(const uint8* src_y,
                         const uint8* src_u,
@@ -220,36 +218,6 @@
   );
 }
 
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n" /* A */
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-
 void I422ToRGBARow_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -309,12 +277,12 @@
   );
 }
 
-#define ARGBTORGB565                                                           \
-    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
+#define ARGBTORGB565                                                        \
+  "shll       v0.8h,  v22.8b, #8             \n" /* R                    */ \
+  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
+  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
+  "sri        v0.8h,  v21.8h, #5             \n" /* RG                   */ \
+  "sri        v0.8h,  v20.8h, #11            \n" /* RGB                  */
 
 void I422ToRGB565Row_NEON(const uint8* src_y,
                           const uint8* src_u,
@@ -346,14 +314,14 @@
   );
 }
 
-#define ARGBTOARGB1555                                                         \
-    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
-    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
-    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
+#define ARGBTOARGB1555                                                      \
+  "shll       v0.8h,  v23.8b, #8             \n" /* A                    */ \
+  "shll       v22.8h, v22.8b, #8             \n" /* R                    */ \
+  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
+  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
+  "sri        v0.8h,  v22.8h, #1             \n" /* AR                   */ \
+  "sri        v0.8h,  v21.8h, #6             \n" /* ARG                  */ \
+  "sri        v0.8h,  v20.8h, #11            \n" /* ARGB                 */
 
 void I422ToARGB1555Row_NEON(const uint8* src_y,
                             const uint8* src_u,
@@ -386,15 +354,15 @@
   );
 }
 
-#define ARGBTOARGB4444                                                         \
-    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
-    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
-    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
-    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
-    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
-    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
-    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
-    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
+#define ARGBTOARGB4444                                                       \
+  /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
+  "ushr       v20.8b, v20.8b, #4             \n" /* B                    */  \
+  "bic        v21.8b, v21.8b, v4.8b          \n" /* G                    */  \
+  "ushr       v22.8b, v22.8b, #4             \n" /* R                    */  \
+  "bic        v23.8b, v23.8b, v4.8b          \n" /* A                    */  \
+  "orr        v0.8b,  v20.8b, v21.8b         \n" /* BG                   */  \
+  "orr        v1.8b,  v22.8b, v23.8b         \n" /* RA                   */  \
+  "zip1       v0.16b, v0.16b, v1.16b         \n" /* BGRA                 */
 
 void I422ToARGB4444Row_NEON(const uint8* src_y,
                             const uint8* src_u,
@@ -428,9 +396,7 @@
   );
 }
 
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
   asm volatile (
     YUVTORGB_SETUP
     "movi       v23.8b, #255                   \n"
@@ -453,9 +419,7 @@
   );
 }
 
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
   asm volatile (
     "movi       v23.8b, #255                   \n"
   "1:                                          \n"
@@ -612,7 +576,9 @@
 }
 
 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8* src_uv,
+                     uint8* dst_u,
+                     uint8* dst_v,
                      int width) {
   asm volatile (
   "1:                                          \n"
@@ -634,7 +600,9 @@
 }
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uv,
                      int width) {
   asm volatile (
   "1:                                          \n"
@@ -728,7 +696,9 @@
   );
 }
 
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_NEON(const uint8* src_uv,
+                      uint8* dst_u,
+                      uint8* dst_v,
                       int width) {
   asm volatile (
     // Start at end of source row.
@@ -834,18 +804,18 @@
   );
 }
 
-#define RGB565TOARGB                                                           \
-    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
-    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
-    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
-    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
-    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
-    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
-    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
-    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
+#define RGB565TOARGB                                                        \
+  "shrn       v6.8b, v0.8h, #5               \n" /* G xxGGGGGG           */ \
+  "shl        v6.8b, v6.8b, #2               \n" /* G GGGGGG00 upper 6   */ \
+  "ushr       v4.8b, v6.8b, #6               \n" /* G 000000GG lower 2   */ \
+  "orr        v1.8b, v4.8b, v6.8b            \n" /* G                    */ \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "ushr       v0.8h, v0.8h, #11              \n" /* R 000RRRRR           */ \
+  "xtn2       v2.16b,v0.8h                   \n" /* R in upper part      */ \
+  "shl        v2.16b, v2.16b, #3             \n" /* R,B BBBBB000 upper 5 */ \
+  "ushr       v0.16b, v2.16b, #5             \n" /* R,B 00000BBB lower 3 */ \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* R,B                  */ \
+  "dup        v2.2D, v0.D[1]                 \n" /* R                    */
 
 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
   asm volatile (
@@ -866,44 +836,45 @@
   );
 }
 
-#define ARGB1555TOARGB                                                         \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
-                                                                               \
-    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
-    "xtn2       v3.16b, v2.8h                  \n"                             \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
-    "dup        v1.2D, v0.D[1]                 \n"                             \
-    "dup        v3.2D, v2.D[1]                 \n"
+#define ARGB1555TOARGB                                                      \
+  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
+  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
+  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000 AAAAAAAA    */ \
+                                                                            \
+  "sshr       v2.8h, v0.8h, #15              \n" /* A AAAAAAAA           */ \
+  "xtn2       v3.16b, v2.8h                  \n"                            \
+                                                                            \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
+                                                                            \
+  "ushr       v1.16b, v3.16b, #5             \n" /* R,A 00000RRR lower 3 */ \
+  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
+  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
+                                                                            \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
+  "orr        v2.16b, v1.16b, v3.16b         \n" /* R,A                  */ \
+  "dup        v1.2D, v0.D[1]                 \n"                            \
+  "dup        v3.2D, v2.D[1]                 \n"
 
 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
-    "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \
+#define RGB555TOARGB                                                        \
+  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
+  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
+  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000             */ \
+                                                                            \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
+                                                                            \
+  "ushr       v1.16b, v3.16b, #5             \n" /* R   00000RRR lower 3 */ \
+  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
+  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
+                                                                            \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
+  "orr        v2.16b, v1.16b, v3.16b         \n" /* R                    */ \
+  "dup        v1.2D, v0.D[1]                 \n" /* G */
 
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
+                            uint8* dst_argb,
                             int width) {
   asm volatile (
     "movi       v3.8b, #255                    \n"  // Alpha
@@ -923,19 +894,20 @@
   );
 }
 
-#define ARGB4444TOARGB                                                         \
-    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
-    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
-    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
-    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
-    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
-    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
-    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
-    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
-    "dup        v0.2D, v2.D[1]                 \n"                             \
-    "dup        v1.2D, v3.D[1]                 \n"
+#define ARGB4444TOARGB                                                      \
+  "shrn       v1.8b,  v0.8h, #8              \n" /* v1(l) AR             */ \
+  "xtn2       v1.16b, v0.8h                  \n" /* v1(h) GB             */ \
+  "shl        v2.16b, v1.16b, #4             \n" /* B,R BBBB0000         */ \
+  "ushr       v3.16b, v1.16b, #4             \n" /* G,A 0000GGGG         */ \
+  "ushr       v0.16b, v2.16b, #4             \n" /* B,R 0000BBBB         */ \
+  "shl        v1.16b, v3.16b, #4             \n" /* G,A GGGG0000         */ \
+  "orr        v2.16b, v0.16b, v2.16b         \n" /* B,R BBBBBBBB         */ \
+  "orr        v3.16b, v1.16b, v3.16b         \n" /* G,A GGGGGGGG         */ \
+  "dup        v0.2D, v2.D[1]                 \n"                            \
+  "dup        v1.2D, v3.D[1]                 \n"
 
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
+                            uint8* dst_argb,
                             int width) {
   asm volatile (
   "1:                                          \n"
@@ -1024,7 +996,9 @@
   );
 }
 
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+                         uint8* dst_u,
+                         uint8* dst_v,
                          int width) {
   asm volatile (
   "1:                                          \n"
@@ -1045,7 +1019,9 @@
   );
 }
 
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+                         uint8* dst_u,
+                         uint8* dst_v,
                          int width) {
   asm volatile (
   "1:                                          \n"
@@ -1066,8 +1042,11 @@
   );
 }
 
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+                      int stride_yuy2,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
   asm volatile (
   "1:                                          \n"
@@ -1094,8 +1073,11 @@
   );
 }
 
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_NEON(const uint8* src_uyvy,
+                      int stride_uyvy,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   const uint8* src_uyvyb = src_uyvy + stride_uyvy;
   asm volatile (
   "1:                                          \n"
@@ -1123,8 +1105,10 @@
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
+void ARGBShuffleRow_NEON(const uint8* src_argb,
+                         uint8* dst_argb,
+                         const uint8* shuffler,
+                         int width) {
   asm volatile (
     MEMACCESS(3)
     "ld1        {v2.16b}, [%3]                 \n"  // shuffler
@@ -1147,7 +1131,8 @@
 void I422ToYUY2Row_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
+                        uint8* dst_yuy2,
+                        int width) {
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -1174,7 +1159,8 @@
 void I422ToUYVYRow_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
+                        uint8* dst_uyvy,
+                        int width) {
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -1216,8 +1202,10 @@
   );
 }
 
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
+                                uint8* dst_rgb,
+                                const uint32 dither4,
+                                int width) {
   asm volatile (
     "dup        v1.4s, %w2                     \n"  // dither4
   "1:                                          \n"
@@ -1239,7 +1227,8 @@
   );
 }
 
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8* src_argb,
+                            uint8* dst_argb1555,
                             int width) {
   asm volatile (
   "1:                                          \n"
@@ -1258,7 +1247,8 @@
   );
 }
 
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8* src_argb,
+                            uint8* dst_argb4444,
                             int width) {
   asm volatile (
     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
@@ -1346,7 +1336,9 @@
 }
 
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8* src_argb,
+                         uint8* dst_u,
+                         uint8* dst_v,
                          int width) {
   asm volatile (
     "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
@@ -1387,83 +1379,41 @@
   );
 }
 
-#define RGBTOUV_SETUP_REG                                                      \
-    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
-    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
-    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
-    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
-    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
-    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
-
-// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
-    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
-    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
-    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
-    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
-    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
-    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
-    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
-    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
-    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
+#define RGBTOUV_SETUP_REG                                                  \
+  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
+  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
+  "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
+  "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
+  "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
+  "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
-    "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
-    "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
-    "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
-    "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
-    "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
-    "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
-    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
-    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
+#define RGBTOUV(QB, QG, QR)                                                 \
+  "mul        v3.8h, " #QB                                                  \
+  ",v20.8h          \n" /* B                    */                          \
+  "mul        v4.8h, " #QR                                                  \
+  ",v20.8h          \n" /* R                    */                          \
+  "mls        v3.8h, " #QG                                                  \
+  ",v21.8h          \n" /* G                    */                          \
+  "mls        v4.8h, " #QG                                                  \
+  ",v24.8h          \n" /* G                    */                          \
+  "mls        v3.8h, " #QR                                                  \
+  ",v22.8h          \n" /* R                    */                          \
+  "mls        v4.8h, " #QB                                                  \
+  ",v23.8h          \n"                          /* B                    */ \
+  "add        v3.8h, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
+  "add        v4.8h, v4.8h, v25.8h           \n" /* +128 -> unsigned     */ \
+  "uqshrn     v0.8b, v3.8h, #8               \n" /* 16 bit to 8 bit U    */ \
+  "uqshrn     v1.8b, v4.8h, #8               \n" /* 16 bit to 8 bit V    */
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
 // TODO(fbarchard): consider ptrdiff_t for all strides.
 
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVRow_NEON(const uint8* src_argb,
+                      int src_stride_argb,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   const uint8* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
     RGBTOUV_SETUP_REG
@@ -1503,8 +1453,11 @@
 }
 
 // TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_NEON(const uint8* src_argb,
+                       int src_stride_argb,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   const uint8* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
@@ -1547,8 +1500,11 @@
   );
 }
 
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_NEON(const uint8* src_bgra,
+                      int src_stride_bgra,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
   asm volatile (
     RGBTOUV_SETUP_REG
@@ -1586,8 +1542,11 @@
   );
 }
 
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ABGRToUVRow_NEON(const uint8* src_abgr,
+                      int src_stride_abgr,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
   asm volatile (
     RGBTOUV_SETUP_REG
@@ -1625,8 +1584,11 @@
   );
 }
 
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_NEON(const uint8* src_rgba,
+                      int src_stride_rgba,
+                      uint8* dst_u,
+                      uint8* dst_v,
+                      int width) {
   const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
   asm volatile (
     RGBTOUV_SETUP_REG
@@ -1664,8 +1626,11 @@
   );
 }
 
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8* dst_u,
+                       uint8* dst_v,
+                       int width) {
   const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
   asm volatile (
     RGBTOUV_SETUP_REG
@@ -1703,8 +1668,11 @@
   );
 }
 
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width) {
+void RAWToUVRow_NEON(const uint8* src_raw,
+                     int src_stride_raw,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
   const uint8* src_raw_1 = src_raw + src_stride_raw;
   asm volatile (
     RGBTOUV_SETUP_REG
@@ -1743,8 +1711,11 @@
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width) {
+void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int width) {
   const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
   asm volatile (
     "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
@@ -1817,8 +1788,11 @@
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int width) {
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8* dst_u,
+                          uint8* dst_v,
+                          int width) {
   const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
   asm volatile (
     RGBTOUV_SETUP_REG
@@ -1886,8 +1860,11 @@
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width) {
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8* dst_u,
+                          uint8* dst_v,
+                          int width) {
   const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
   asm volatile (
     RGBTOUV_SETUP_REG
@@ -2169,8 +2146,10 @@
 
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
+                         const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
   const uint8* src_ptr1 = src_ptr + src_stride;
@@ -2235,8 +2214,10 @@
 }
 
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBBlendRow_NEON(const uint8* src_argb0,
+                       const uint8* src_argb1,
+                       uint8* dst_argb,
+                       int width) {
   asm volatile (
     "subs       %w3, %w3, #8                   \n"
     "b.lt       89f                            \n"
@@ -2331,8 +2312,11 @@
 
 // Quantize 8 ARGB pixels (32 bytes).
 // dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
+void ARGBQuantizeRow_NEON(uint8* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
   asm volatile (
     "dup        v4.8h, %w2                     \n"
     "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
@@ -2374,7 +2358,9 @@
 // Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+void ARGBShadeRow_NEON(const uint8* src_argb,
+                       uint8* dst_argb,
+                       int width,
                        uint32 value) {
   asm volatile (
     "dup        v0.4s, %w3                     \n"  // duplicate scale value.
@@ -2484,8 +2470,10 @@
 // Transform 8 ARGB pixels (32 bytes) with color matrix.
 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
 // needs to saturate.  Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_NEON(const uint8* src_argb,
+                             uint8* dst_argb,
+                             const int8* matrix_argb,
+                             int width) {
   asm volatile (
     MEMACCESS(3)
     "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
@@ -2546,8 +2534,10 @@
 
 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+void ARGBMultiplyRow_NEON(const uint8* src_argb0,
+                          const uint8* src_argb1,
+                          uint8* dst_argb,
+                          int width) {
   asm volatile (
     // 8 pixel loop.
   "1:                                          \n"
@@ -2578,8 +2568,10 @@
 }
 
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+void ARGBAddRow_NEON(const uint8* src_argb0,
+                     const uint8* src_argb1,
+                     uint8* dst_argb,
+                     int width) {
   asm volatile (
     // 8 pixel loop.
   "1:                                          \n"
@@ -2606,8 +2598,10 @@
 }
 
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+void ARGBSubtractRow_NEON(const uint8* src_argb0,
+                          const uint8* src_argb1,
+                          uint8* dst_argb,
+                          int width) {
   asm volatile (
     // 8 pixel loop.
   "1:                                          \n"
@@ -2638,8 +2632,10 @@
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
+void SobelRow_NEON(const uint8* src_sobelx,
+                   const uint8* src_sobely,
+                   uint8* dst_argb,
+                   int width) {
   asm volatile (
     "movi       v3.8b, #255                    \n"  // alpha
     // 8 pixel loop.
@@ -2665,8 +2661,10 @@
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
+void SobelToPlaneRow_NEON(const uint8* src_sobelx,
+                          const uint8* src_sobely,
+                          uint8* dst_y,
+                          int width) {
   asm volatile (
     // 16 pixel loop.
   "1:                                          \n"
@@ -2693,8 +2691,10 @@
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
+void SobelXYRow_NEON(const uint8* src_sobelx,
+                     const uint8* src_sobely,
+                     uint8* dst_argb,
+                     int width) {
   asm volatile (
     "movi       v3.8b, #255                    \n"  // alpha
     // 8 pixel loop.
@@ -2721,8 +2721,11 @@
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+void SobelXRow_NEON(const uint8* src_y0,
+                    const uint8* src_y1,
+                    const uint8* src_y2,
+                    uint8* dst_sobelx,
+                    int width) {
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -2764,8 +2767,10 @@
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
+void SobelYRow_NEON(const uint8* src_y0,
+                    const uint8* src_y1,
+                    uint8* dst_sobely,
+                    int width) {
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -2801,6 +2806,56 @@
   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
+
+// Caveat - rounds float to half float whereas scaling version truncates.
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+    "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
+    "uxtl2      v3.4s, v1.8h                   \n"
+    "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+    "scvtf      v3.4s, v3.4s                   \n"
+    "fcvtn      v1.4h, v2.4s                   \n"  // 8 half floats
+    "fcvtn2     v1.8h, v3.4s                   \n"
+   MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
+    "b.gt       1b                             \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3"
+  );
+}
+
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+    "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
+    "uxtl2      v3.4s, v1.8h                   \n"
+    "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+    "scvtf      v3.4s, v3.4s                   \n"
+    "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
+    "fmul       v3.4s, v3.4s, %3.s[0]          \n"
+    "uqshrn     v1.4h, v2.4s, #13              \n"  // isolate halffloat
+    "uqshrn2    v1.8h, v3.4s, #13              \n"
+   MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
+    "b.gt       1b                             \n"
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "w"(scale * 1.9259299444e-34f)    // %3
+  : "cc", "memory", "v1", "v2", "v3"
+  );
+}
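The scale operand is pre-multiplied by 1.9259299444e-34f, which is 2^-112: rebiasing the float exponent by 127 - 15 means a plain 13-bit right shift of the float's bit pattern yields the IEEE half-float encoding, which is what uqshrn #13 does per lane (and why this path truncates while HalfFloat1Row_NEON's fcvtn rounds). A scalar sketch under that reading:

  #include <stdint.h>
  #include <string.h>
  static inline uint16_t ToHalfTruncate(uint16_t v, float scale) {
    float f = (float)v * scale * 1.9259299444e-34f;  // fold in 2^-112
    uint32_t u;
    memcpy(&u, &f, sizeof(u));   // reinterpret the float bits
    return (uint16_t)(u >> 13);  // exponent and mantissa land in half layout
                                 // (uqshrn also saturates; omitted here)
  }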
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index 2a3da89..202f2b8 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -28,61 +28,60 @@
 #if defined(_M_X64)
 
 // Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422                                                             \
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
-    u_buf += 4;                                                                \
-    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
-    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
-    y_buf += 8;
+#define READYUV422                                      \
+  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);            \
+  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                 \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                \
+  u_buf += 4;                                           \
+  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
+  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
+  y_buf += 8;
 
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422                                                            \
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
-    u_buf += 4;                                                                \
-    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
-    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
-    y_buf += 8;                                                                \
-    xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \
-    a_buf += 8;
+#define READYUVA422                                     \
+  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);            \
+  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                 \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                \
+  u_buf += 4;                                           \
+  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
+  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
+  y_buf += 8;                                           \
+  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);              \
+  a_buf += 8;
 
 // Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants)                                                 \
-    xmm1 = _mm_loadu_si128(&xmm0);                                             \
-    xmm2 = _mm_loadu_si128(&xmm0);                                             \
-    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \
-    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \
-    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \
-    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \
-    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \
-    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \
-    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \
-    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
-    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
-    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
-    xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
-    xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
-    xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
-    xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \
-    xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \
-    xmm2 = _mm_packus_epi16(xmm2, xmm2);
+#define YUVTORGB(yuvconstants)                                     \
+  xmm1 = _mm_loadu_si128(&xmm0);                                   \
+  xmm2 = _mm_loadu_si128(&xmm0);                                   \
+  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
+  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
+  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);   \
+  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);   \
+  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);   \
+  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);  \
+  xmm0 = _mm_adds_epi16(xmm0, xmm4);                               \
+  xmm1 = _mm_adds_epi16(xmm1, xmm4);                               \
+  xmm2 = _mm_adds_epi16(xmm2, xmm4);                               \
+  xmm0 = _mm_srai_epi16(xmm0, 6);                                  \
+  xmm1 = _mm_srai_epi16(xmm1, 6);                                  \
+  xmm2 = _mm_srai_epi16(xmm2, 6);                                  \
+  xmm0 = _mm_packus_epi16(xmm0, xmm0);                             \
+  xmm1 = _mm_packus_epi16(xmm1, xmm1);                             \
+  xmm2 = _mm_packus_epi16(xmm2, xmm2);
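Schematically, each channel produced by YUVTORGB follows the same fixed-point pattern; a hedged sketch (the kUVToB/kUVBiasB/kYToRgb tables referenced above are not reproduced in this hunk):

  #include <stdint.h>
  // channel = clamp((bias - uv_term + y_scaled) >> 6), per the sequence above.
  static inline uint8_t YuvToChannel(int y_scaled, int uv_term, int bias) {
    int c = (bias - uv_term + y_scaled) >> 6;
    if (c < 0) c = 0;
    if (c > 255) c = 255;
    return (uint8_t)c;
  }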
 
 // Store 8 ARGB values.
-#define STOREARGB                                                              \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \
-    xmm1 = _mm_loadu_si128(&xmm0);                                             \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \
-    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \
-    _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \
-    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \
-    dst_argb += 32;
-
+#define STOREARGB                                    \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
+  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
+  xmm1 = _mm_loadu_si128(&xmm0);                     \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
+  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
+  _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
+  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
+  dst_argb += 32;
 
 #if defined(HAS_I422TOARGBROW_SSSE3)
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
@@ -127,175 +126,143 @@
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constants for ARGB.
-static const vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+                              13, 65, 33, 0, 13, 65, 33, 0};
 
 // JPeg full range.
-static const vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+                               15, 75, 38, 0, 15, 75, 38, 0};
 
-static const vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                              112, -74, -38, 0, 112, -74, -38, 0};
 
-static const vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+                               127, -84, -43, 0, 127, -84, -43, 0};
 
 static const vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
 
-static const vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+                               -20, -107, 127, 0, -20, -107, 127, 0};
 
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
 
 // Constants for BGRA.
-static const vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+                              0, 33, 65, 13, 0, 33, 65, 13};
 
-static const vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                              0, -38, -74, 112, 0, -38, -74, 112};
 
-static const vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                              0, 112, -94, -18, 0, 112, -94, -18};
 
 // Constants for ABGR.
-static const vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+                              33, 65, 13, 0, 33, 65, 13, 0};
 
-static const vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                              -38, -74, 112, 0, -38, -74, 112, 0};
 
-static const vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                              112, -94, -18, 0, 112, -94, -18, 0};
 
 // Constants for RGBA.
-static const vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+                              0, 13, 65, 33, 0, 13, 65, 33};
 
-static const vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                              0, 112, -74, -38, 0, 112, -74, -38};
 
-static const vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                              0, -18, -94, 112, 0, -18, -94, 112};
 
-static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
 
 // 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
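As a hedged reading of these constants (an assumption about how the row functions combine them; the pmaddubsw path is outside this hunk): kARGBToY holds the BT.601 luma weights halved so a byte-pair multiply-add stays within 16 bits, and kAddY16 restores the studio-swing offset, roughly:

  #include <stdint.h>
  // Approximate scalar luma for ARGB memory order (B, G, R, A).
  static inline uint8_t ARGBLuma(uint8_t b, uint8_t g, uint8_t r) {
    return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
  }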
 
-static const uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
-static const uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
 
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
+                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
 
 // Shuffle table for converting RAW to RGB24.  First 8.
 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
-  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Middle 8.
 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
-  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Last 8.
 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
-  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RGB24.
 static const uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RAW.
 static const uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
 
 // YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
+static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
+                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
+                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
 
 // YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
+                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
+                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
 
 // UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
+                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
+                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
 
 // UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
+                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
+                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
 
 // NV21 shuf 8 VU to 16 UV.
 static const lvec8 kShuffleNV21 = {
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
 };
 
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
+                                          uint8* dst_argb,
+                                          int width) {
   __asm {
-    mov        eax, [esp + 4]        // src_y
-    mov        edx, [esp + 8]        // dst_argb
-    mov        ecx, [esp + 12]       // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src_y
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
     pslld      xmm5, 24
 
   convertloop:
@@ -318,13 +285,14 @@
 
 #ifdef HAS_J400TOARGBROW_AVX2
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
+                                          uint8* dst_argb,
+                                          int width) {
   __asm {
-    mov         eax, [esp + 4]        // src_y
-    mov         edx, [esp + 8]        // dst_argb
-    mov         ecx, [esp + 12]       // width
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
+    mov         eax, [esp + 4]  // src_y
+    mov         edx, [esp + 8]  // dst_argb
+    mov         ecx, [esp + 12]  // width
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
     vpslld      ymm5, ymm5, 24
 
   convertloop:
@@ -348,13 +316,14 @@
 }
 #endif  // HAS_J400TOARGBROW_AVX2
 
-__declspec(naked)
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
+                                            uint8* dst_argb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_rgb24
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_rgb24
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
     pslld     xmm5, 24
     movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
 
@@ -364,17 +333,17 @@
     movdqu    xmm3, [eax + 32]
     lea       eax, [eax + 48]
     movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
     pshufb    xmm2, xmm4
     por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
     movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
     movdqu    [edx], xmm0
     por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
     movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
@@ -386,14 +355,14 @@
   }
 }
 
-__declspec(naked)
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
-                        int width) {
+__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
+                                          uint8* dst_argb,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_raw
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_raw
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
     pslld     xmm5, 24
     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
 
@@ -403,17 +372,17 @@
     movdqu    xmm3, [eax + 32]
     lea       eax, [eax + 48]
     movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
     pshufb    xmm2, xmm4
     por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
     movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
     movdqu    [edx], xmm0
     por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
     movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
@@ -425,11 +394,12 @@
   }
 }
 
-__declspec(naked)
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
+                                           uint8* dst_rgb24,
+                                           int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_raw
-    mov       edx, [esp + 8]   // dst_rgb24
+    mov       eax, [esp + 4]  // src_raw
+    mov       edx, [esp + 8]  // dst_rgb24
     mov       ecx, [esp + 12]  // width
     movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
@@ -460,9 +430,9 @@
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions.
-__declspec(naked)
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                          int width) {
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
+                                            uint8* dst_argb,
+                                            int width) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
@@ -470,33 +440,33 @@
     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
     movd      xmm6, eax
     pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
     psllw     xmm3, 11
-    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
     psllw     xmm4, 10
     psrlw     xmm4, 5
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
     psllw     xmm7, 8
 
-    mov       eax, [esp + 4]   // src_rgb565
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_rgb565
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
+    movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
     movdqa    xmm1, xmm0
     movdqa    xmm2, xmm0
-    pand      xmm1, xmm3    // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    pand      xmm1, xmm3  // R in upper 5 bits
+    psllw     xmm2, 11  // B in upper 5 bits
+    pmulhuw   xmm1, xmm5  // * (256 + 8)
+    pmulhuw   xmm2, xmm5  // * (256 + 8)
     psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
-    pand      xmm0, xmm4    // G in middle 6 bits
-    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
-    por       xmm0, xmm7    // AG
+    por       xmm1, xmm2  // RB
+    pand      xmm0, xmm4  // G in middle 6 bits
+    pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
+    por       xmm0, xmm7  // AG
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
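The multiply constants in RGB565ToARGBRow implement bit replication: a 5-bit value parked in the top of a 16-bit lane, pmulhuw'd by 0x0108 (256 + 8), yields the familiar (v << 3) | (v >> 2) expansion, and 0x2080 ((256 + 4) << 5) does the 6-bit green with its shift folded in. A scalar sketch of the whole step under those identities, assuming libyuv's uint8/uint16 typedefs:

// Expand RGB565 (little-endian, B in the low 5 bits) to 8-bit ARGB.
// (v << 3) | (v >> 2) replicates 5 bits to 8; (g << 2) | (g >> 4) replicates 6,
// which is exactly what the pmulhuw constants above compute.
static void RGB565ToARGBRow_C_sketch(const uint8* src_rgb565, uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint16 pix = (uint16)(src_rgb565[0] | (src_rgb565[1] << 8));
    uint8 b = (uint8)(pix & 0x1f);
    uint8 g = (uint8)((pix >> 5) & 0x3f);
    uint8 r = (uint8)(pix >> 11);
    dst_argb[0] = (uint8)((b << 3) | (b >> 2));
    dst_argb[1] = (uint8)((g << 2) | (g >> 4));
    dst_argb[2] = (uint8)((r << 3) | (r >> 2));
    dst_argb[3] = 0xff;
    src_rgb565 += 2;
    dst_argb += 4;
  }
}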
@@ -516,9 +486,9 @@
 // v * 256 + v * 8
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked)
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
-                          int width) {
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
+                                            uint8* dst_argb,
+                                            int width) {
   __asm {
     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
     vmovd      xmm5, eax
@@ -526,32 +496,32 @@
     mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
     vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
-    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
     vpsllw     ymm4, ymm4, 10
     vpsrlw     ymm4, ymm4, 5
-    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
     vpsllw     ymm7, ymm7, 8
 
-    mov        eax, [esp + 4]   // src_rgb565
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_rgb565
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     sub        edx, eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
-    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
+    vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
     vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
-    vpor       ymm0, ymm0, ymm7    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
+    vpor       ymm0, ymm0, ymm7  // AG
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm1, ymm1, 0xd8
     vpunpckhbw ymm2, ymm1, ymm0
     vpunpcklbw ymm1, ymm1, ymm0
@@ -567,9 +537,9 @@
 #endif  // HAS_RGB565TOARGBROW_AVX2
 
 #ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked)
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
+                                              uint8* dst_argb,
+                                              int width) {
   __asm {
     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
     vmovd      xmm5, eax
@@ -577,33 +547,33 @@
     mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
     vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
-    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
-    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+    vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
     vpsllw     ymm7, ymm7, 8
 
-    mov        eax,  [esp + 4]   // src_argb1555
-    mov        edx,  [esp + 8]   // dst_argb
+    mov        eax,  [esp + 4]  // src_argb1555
+    mov        edx,  [esp + 8]  // dst_argb
     mov        ecx,  [esp + 12]  // width
     sub        edx,  eax
     sub        edx,  eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
-    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
+    vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
     vpand      ymm1, ymm1, ymm3
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
     vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpsraw     ymm2, ymm0, 8       // A
-    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpsraw     ymm2, ymm0, 8  // A
+    vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
     vpand      ymm2, ymm2, ymm7
-    vpor       ymm0, ymm0, ymm2    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpor       ymm0, ymm0, ymm2  // AG
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm1, ymm1, 0xd8
     vpunpckhbw ymm2, ymm1, ymm0
     vpunpcklbw ymm1, ymm1, ymm0
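Worth noting in the ARGB1555 path: the alpha bit lives in bit 15, so vpsraw ymm2, ymm0, 8 sign-extends it across the high byte and the 0xff00ff00 mask isolates it, turning 1-bit alpha into 0x00 or 0xff without a branch. A scalar sketch of the expansion, again assuming libyuv's uint8/uint16 typedefs:

// ARGB1555 (A in bit 15, then 5:5:5 R:G:B) to 8-bit ARGB. The negation
// reproduces the psraw/vpsraw alpha replication: 0 stays 0, 1 becomes 0xff.
static void ARGB1555ToARGBRow_C_sketch(const uint8* src_argb1555, uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint16 pix = (uint16)(src_argb1555[0] | (src_argb1555[1] << 8));
    uint8 b = (uint8)(pix & 0x1f);
    uint8 g = (uint8)((pix >> 5) & 0x1f);
    uint8 r = (uint8)((pix >> 10) & 0x1f);
    uint8 a = (uint8)(-(int)(pix >> 15));
    dst_argb[0] = (uint8)((b << 3) | (b >> 2));
    dst_argb[1] = (uint8)((g << 3) | (g >> 2));
    dst_argb[2] = (uint8)((r << 3) | (r >> 2));
    dst_argb[3] = a;
    src_argb1555 += 2;
    dst_argb += 4;
  }
}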
@@ -619,29 +589,29 @@
 #endif  // HAS_ARGB1555TOARGBROW_AVX2
 
 #ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked)
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
+                                              uint8* dst_argb,
+                                              int width) {
   __asm {
     mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
     vmovd     xmm4, eax
     vbroadcastss ymm4, xmm4
-    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
-    mov       eax,  [esp + 4]   // src_argb4444
-    mov       edx,  [esp + 8]   // dst_argb
+    vpslld    ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
+    mov       eax,  [esp + 4]  // src_argb4444
+    mov       edx,  [esp + 8]  // dst_argb
     mov       ecx,  [esp + 12]  // width
     sub       edx,  eax
     sub       edx,  eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
-    vpand      ymm2, ymm0, ymm5    // mask high nibbles
-    vpand      ymm0, ymm0, ymm4    // mask low nibbles
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
+    vpand      ymm2, ymm0, ymm5  // mask high nibbles
+    vpand      ymm0, ymm0, ymm4  // mask low nibbles
     vpsrlw     ymm3, ymm2, 4
     vpsllw     ymm1, ymm0, 4
     vpor       ymm2, ymm2, ymm3
     vpor       ymm0, ymm0, ymm1
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm2, ymm2, 0xd8
     vpunpckhbw ymm1, ymm0, ymm2
     vpunpcklbw ymm0, ymm0, ymm2
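The 4444 expansion splits each byte into its low and high nibble and ORs each nibble with a shifted copy of itself, i.e. n -> n * 0x11. Scalar equivalent, assuming libyuv's uint8 typedef:

// ARGB4444 to ARGB8888 by duplicating each 4-bit channel into both nibbles.
static void ARGB4444ToARGBRow_C_sketch(const uint8* src_argb4444, uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8 b = (uint8)(src_argb4444[0] & 0x0f);
    uint8 g = (uint8)(src_argb4444[0] >> 4);
    uint8 r = (uint8)(src_argb4444[1] & 0x0f);
    uint8 a = (uint8)(src_argb4444[1] >> 4);
    dst_argb[0] = (uint8)(b * 0x11);  // same as (b << 4) | b
    dst_argb[1] = (uint8)(g * 0x11);
    dst_argb[2] = (uint8)(r * 0x11);
    dst_argb[3] = (uint8)(a * 0x11);
    src_argb4444 += 2;
    dst_argb += 4;
  }
}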
@@ -657,9 +627,9 @@
 #endif  // HAS_ARGB4444TOARGBROW_AVX2
 
 // 24 instructions
-__declspec(naked)
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
+                                              uint8* dst_argb,
+                                              int width) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
@@ -667,36 +637,36 @@
     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
     movd      xmm6, eax
     pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
     psllw     xmm3, 11
-    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
+    movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
     psrlw     xmm4, 6
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
     psllw     xmm7, 8
 
-    mov       eax, [esp + 4]   // src_argb1555
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_argb1555
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
+    movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
     movdqa    xmm1, xmm0
     movdqa    xmm2, xmm0
-    psllw     xmm1, 1       // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
+    psllw     xmm1, 1  // R in upper 5 bits
+    psllw     xmm2, 11  // B in upper 5 bits
     pand      xmm1, xmm3
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    pmulhuw   xmm2, xmm5  // * (256 + 8)
+    pmulhuw   xmm1, xmm5  // * (256 + 8)
     psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
+    por       xmm1, xmm2  // RB
     movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // G in middle 5 bits
-    psraw     xmm2, 8       // A
-    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
+    pand      xmm0, xmm4  // G in middle 5 bits
+    psraw     xmm2, 8  // A
+    pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
     pand      xmm2, xmm7
-    por       xmm0, xmm2    // AG
+    por       xmm0, xmm2  // AG
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
@@ -710,26 +680,26 @@
 }
 
 // 18 instructions.
-__declspec(naked)
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
+                                              uint8* dst_argb,
+                                              int width) {
   __asm {
     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
     movd      xmm4, eax
     pshufd    xmm4, xmm4, 0
-    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
+    movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
     pslld     xmm5, 4
-    mov       eax, [esp + 4]   // src_argb4444
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_argb4444
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
+    movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
     movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // mask low nibbles
-    pand      xmm2, xmm5    // mask high nibbles
+    pand      xmm0, xmm4  // mask low nibbles
+    pand      xmm2, xmm5  // mask high nibbles
     movdqa    xmm1, xmm0
     movdqa    xmm3, xmm2
     psllw     xmm1, 4
@@ -748,37 +718,38 @@
   }
 }
 
-__declspec(naked)
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
+                                            uint8* dst_rgb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
     movdqu    xmm2, [eax + 32]
     movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
     pshufb    xmm2, xmm6
     pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
+    psrldq    xmm1, 4  // 8 bytes from 1
+    pslldq    xmm4, 12  // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
+    por       xmm0, xmm4  // 4 bytes from 1 for 0
+    pslldq    xmm5, 8  // 8 bytes from 2 for 1
     movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
+    por       xmm1, xmm5  // 8 bytes from 2 for 1
+    psrldq    xmm2, 8  // 4 bytes from 2
+    pslldq    xmm3, 4  // 12 bytes from 3 for 2
+    por       xmm2, xmm3  // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1  // store 1
+    movdqu    [edx + 32], xmm2  // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
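ARGBToRGB24Row_SSSE3 is the reverse direction: drop the alpha byte and repack. The pslldq/psrldq/por sequence only stitches four 12-byte shuffle results back into three contiguous 16-byte stores. Per pixel, sketched in scalar form with libyuv's B, G, R ordering for RGB24:

// Strip alpha; ARGBToRAW is the same with the output channel order reversed.
static void ARGBToRGB24Row_C_sketch(const uint8* src_argb, uint8* dst_rgb, int width) {
  for (int x = 0; x < width; ++x) {
    dst_rgb[0] = src_argb[0];  // B
    dst_rgb[1] = src_argb[1];  // G
    dst_rgb[2] = src_argb[2];  // R
    src_argb += 4;
    dst_rgb += 3;
  }
}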
@@ -786,37 +757,38 @@
   }
 }
 
-__declspec(naked)
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
+                                          uint8* dst_rgb,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
     movdqu    xmm2, [eax + 32]
     movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
     pshufb    xmm2, xmm6
     pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
+    psrldq    xmm1, 4  // 8 bytes from 1
+    pslldq    xmm4, 12  // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
+    por       xmm0, xmm4  // 4 bytes from 1 for 0
+    pslldq    xmm5, 8  // 8 bytes from 2 for 1
     movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
+    por       xmm1, xmm5  // 8 bytes from 2 for 1
+    psrldq    xmm2, 8  // 4 bytes from 2
+    pslldq    xmm3, 4  // 12 bytes from 3 for 2
+    por       xmm2, xmm3  // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1  // store 1
+    movdqu    [edx + 32], xmm2  // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
@@ -824,33 +796,34 @@
   }
 }
 
-__declspec(naked)
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
+                                            uint8* dst_rgb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
     psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
     psrld     xmm4, 26
     pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
     pslld     xmm5, 11
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    pslld     xmm0, 8  // R
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 5  // G
+    psrad     xmm0, 16  // R
+    pand      xmm1, xmm3  // B
+    pand      xmm2, xmm4  // G
+    pand      xmm0, xmm5  // R
+    por       xmm1, xmm2  // BG
+    por       xmm0, xmm1  // BGR
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
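The packing direction simply truncates each channel to 5/6/5 bits; the 0x1f, 0x7e0 and 0xfffff800 masks select the already-shifted fields before packssdw narrows the dwords. Scalar equivalent, assuming libyuv's uint8/uint16 typedefs:

// Truncate 8-bit B, G, R to 5, 6, 5 bits and store as a little-endian word.
static void ARGBToRGB565Row_C_sketch(const uint8* src_argb, uint8* dst_rgb, int width) {
  for (int x = 0; x < width; ++x) {
    uint16 pix = (uint16)((src_argb[0] >> 3) |          // B -> bits 0..4
                          ((src_argb[1] >> 2) << 5) |   // G -> bits 5..10
                          ((src_argb[2] >> 3) << 11));  // R -> bits 11..15
    dst_rgb[0] = (uint8)pix;
    dst_rgb[1] = (uint8)(pix >> 8);
    src_argb += 4;
    dst_rgb += 2;
  }
}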
@@ -861,41 +834,42 @@
   }
 }
 
-__declspec(naked)
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
+__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
+                                                  uint8* dst_rgb,
+                                                  const uint32 dither4,
+                                                  int width) {
   __asm {
 
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    movd      xmm6, [esp + 12] // dither4
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
+    movd      xmm6, [esp + 12]  // dither4
     mov       ecx, [esp + 16]  // width
-    punpcklbw xmm6, xmm6       // make dither 16 bytes
+    punpcklbw xmm6, xmm6  // make dither 16 bytes
     movdqa    xmm7, xmm6
     punpcklwd xmm6, xmm6
     punpckhwd xmm7, xmm7
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
     psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
     psrld     xmm4, 26
     pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
     pslld     xmm5, 11
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    paddusb   xmm0, xmm6    // add dither
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    paddusb   xmm0, xmm6  // add dither
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    pslld     xmm0, 8  // R
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 5  // G
+    psrad     xmm0, 16  // R
+    pand      xmm1, xmm3  // B
+    pand      xmm2, xmm4  // G
+    pand      xmm0, xmm5  // R
+    por       xmm1, xmm2  // BG
+    por       xmm0, xmm1  // BGR
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
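The Dither variant adds one step before truncation: dither4 holds a 4-byte pattern that is widened to one byte per channel (punpcklbw/punpcklwd) and added with unsigned saturation, so bright values clamp at 255 rather than wrapping. A scalar sketch of that extra step, with the helper names purely illustrative and libyuv typedefs assumed:

// Saturating add of the per-column dither value, then the same 565 pack as above.
static inline uint8 AddSat255(int v, int d) {
  int s = v + d;
  return (uint8)(s > 255 ? 255 : s);  // paddusb behaviour
}

static void ARGBToRGB565DitherRow_C_sketch(const uint8* src_argb, uint8* dst_rgb,
                                           uint32 dither4, int width) {
  for (int x = 0; x < width; ++x) {
    uint8 d = (uint8)(dither4 >> ((x & 3) * 8));  // repeat the 4-byte pattern
    uint8 b = AddSat255(src_argb[0], d);
    uint8 g = AddSat255(src_argb[1], d);
    uint8 r = AddSat255(src_argb[2], d);
    uint16 pix = (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    dst_rgb[0] = (uint8)pix;
    dst_rgb[1] = (uint8)(pix >> 8);
    src_argb += 4;
    dst_rgb += 2;
  }
}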
@@ -907,39 +881,40 @@
 }
 
 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked)
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
+__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
+                                                  uint8* dst_rgb,
+                                                  const uint32 dither4,
+                                                  int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
     vbroadcastss xmm6, [esp + 12]  // dither4
-    mov        ecx, [esp + 16]     // width
-    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
+    mov        ecx, [esp + 16]  // width
+    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
     vpermq     ymm6, ymm6, 0xd8
     vpunpcklwd ymm6, ymm6, ymm6
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
     vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
     vpsrld     ymm4, ymm4, 26
     vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpaddusb   ymm0, ymm0, ymm6    // add dither
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpaddusb   ymm0, ymm0, ymm6  // add dither
+    vpsrld     ymm2, ymm0, 5  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrld     ymm0, ymm0, 8  // R
+    vpand      ymm2, ymm2, ymm4  // G
+    vpand      ymm1, ymm1, ymm3  // B
+    vpand      ymm0, ymm0, ymm5  // R
+    vpor       ymm1, ymm1, ymm2  // BG
+    vpor       ymm0, ymm0, ymm1  // BGR
     vpackusdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -950,37 +925,38 @@
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
 
 // TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked)
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
+                                              uint8* dst_rgb,
+                                              int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
+    pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
     psrld     xmm4, 27
-    movdqa    xmm5, xmm4       // generate mask 0x000003e0
+    movdqa    xmm5, xmm4  // generate mask 0x000003e0
     pslld     xmm5, 5
-    movdqa    xmm6, xmm4       // generate mask 0x00007c00
+    movdqa    xmm6, xmm4  // generate mask 0x00007c00
     pslld     xmm6, 10
-    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
+    pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
     pslld     xmm7, 15
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    movdqa    xmm3, xmm0    // R
-    psrad     xmm0, 16      // A
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 6       // G
-    psrld     xmm3, 9       // R
-    pand      xmm0, xmm7    // A
-    pand      xmm1, xmm4    // B
-    pand      xmm2, xmm5    // G
-    pand      xmm3, xmm6    // R
-    por       xmm0, xmm1    // BA
-    por       xmm2, xmm3    // GR
-    por       xmm0, xmm2    // BGRA
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    movdqa    xmm3, xmm0  // R
+    psrad     xmm0, 16  // A
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 6  // G
+    psrld     xmm3, 9  // R
+    pand      xmm0, xmm7  // A
+    pand      xmm1, xmm4  // B
+    pand      xmm2, xmm5  // G
+    pand      xmm3, xmm6  // R
+    por       xmm0, xmm1  // BA
+    por       xmm2, xmm3  // GR
+    por       xmm0, xmm2  // BGRA
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
@@ -991,22 +967,23 @@
   }
 }
 
-__declspec(naked)
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
+                                              uint8* dst_rgb,
+                                              int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
+    pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
     psllw     xmm4, 12
-    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
+    movdqa    xmm3, xmm4  // generate mask 0x00f000f0
     psrlw     xmm3, 8
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
     movdqa    xmm1, xmm0
-    pand      xmm0, xmm3    // low nibble
-    pand      xmm1, xmm4    // high nibble
+    pand      xmm0, xmm3  // low nibble
+    pand      xmm1, xmm4  // high nibble
     psrld     xmm0, 4
     psrld     xmm1, 8
     por       xmm0, xmm1
@@ -1021,33 +998,34 @@
 }
 
 #ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked)
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
+                                            uint8* dst_rgb,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // width
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
     vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
     vpsrld     ymm4, ymm4, 26
     vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpsrld     ymm2, ymm0, 5  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrld     ymm0, ymm0, 8  // R
+    vpand      ymm2, ymm2, ymm4  // G
+    vpand      ymm1, ymm1, ymm3  // B
+    vpand      ymm0, ymm0, ymm5  // R
+    vpor       ymm1, ymm1, ymm2  // BG
+    vpor       ymm0, ymm0, ymm1  // BGR
     vpackusdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1058,36 +1036,37 @@
 #endif  // HAS_ARGBTORGB565ROW_AVX2
 
 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked)
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
+                                              uint8* dst_rgb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // width
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm4, ymm4, ymm4
-    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
-    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
-    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
-    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
+    vpsrld     ymm4, ymm4, 27  // generate mask 0x0000001f
+    vpslld     ymm5, ymm4, 5  // generate mask 0x000003e0
+    vpslld     ymm6, ymm4, 10  // generate mask 0x00007c00
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffff8000
     vpslld     ymm7, ymm7, 15
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm3, ymm0, 9       // R
-    vpsrld     ymm2, ymm0, 6       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrad     ymm0, ymm0, 16      // A
-    vpand      ymm3, ymm3, ymm6    // R
-    vpand      ymm2, ymm2, ymm5    // G
-    vpand      ymm1, ymm1, ymm4    // B
-    vpand      ymm0, ymm0, ymm7    // A
-    vpor       ymm0, ymm0, ymm1    // BA
-    vpor       ymm2, ymm2, ymm3    // GR
-    vpor       ymm0, ymm0, ymm2    // BGRA
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpsrld     ymm3, ymm0, 9  // R
+    vpsrld     ymm2, ymm0, 6  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrad     ymm0, ymm0, 16  // A
+    vpand      ymm3, ymm3, ymm6  // R
+    vpand      ymm2, ymm2, ymm5  // G
+    vpand      ymm1, ymm1, ymm4  // B
+    vpand      ymm0, ymm0, ymm7  // A
+    vpor       ymm0, ymm0, ymm1  // BA
+    vpor       ymm2, ymm2, ymm3  // GR
+    vpor       ymm0, ymm0, ymm2  // BGRA
     vpackssdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
+    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB1555
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1098,27 +1077,28 @@
 #endif  // HAS_ARGBTOARGB1555ROW_AVX2
 
 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked)
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
+                                              uint8* dst_rgb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_rgb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
     mov        ecx, [esp + 12]  // width
-    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xf000f000
     vpsllw     ymm4, ymm4, 12
-    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
+    vpsrlw     ymm3, ymm4, 8  // generate mask 0x00f000f0
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpand      ymm1, ymm0, ymm4    // high nibble
-    vpand      ymm0, ymm0, ymm3    // low nibble
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpand      ymm1, ymm0, ymm4  // high nibble
+    vpand      ymm0, ymm0, ymm3  // low nibble
     vpsrld     ymm1, ymm1, 8
     vpsrld     ymm0, ymm0, 4
     vpor       ymm0, ymm0, ymm1
     vpackuswb  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
+    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB4444
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1129,12 +1109,13 @@
 #endif  // HAS_ARGBTOARGB4444ROW_AVX2
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked)
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
+                                        uint8* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kARGBToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1164,12 +1145,13 @@
 
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked)
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
+                                         uint8* dst_y,
+                                         int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kARGBToYJ
     movdqa     xmm5, xmmword ptr kAddYJ64
 
@@ -1200,17 +1182,16 @@
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
 
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
+                                       uint8* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     vbroadcastf128 ymm4, xmmword ptr kARGBToY
     vbroadcastf128 ymm5, xmmword ptr kAddY16
     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
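For orientation on the Y rows: kARGBToY / kAddY16 approximate BT.601 studio-range luma (the +16 offset folded into the add constant), while the YJ variants use full-range JPEG weights with rounding and no offset. A scalar sketch of the two formulas, with integer weights shown as the standard 8-bit fixed-point approximations rather than values copied from the vector constants:

// Studio range: Y = 16 + 0.257 R + 0.504 G + 0.098 B  (0x1080 = 16*256 + 128 rounding).
static inline uint8 RGBToY_sketch(uint8 r, uint8 g, uint8 b) {
  return (uint8)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}

// Full range (JPEG): Y = 0.299 R + 0.587 G + 0.114 B, rounded, no +16 offset.
static inline uint8 RGBToYJ_sketch(uint8 r, uint8 g, uint8 b) {
  return (uint8)((77 * r + 150 * g + 29 * b + 128) >> 8);
}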
@@ -1244,12 +1225,13 @@
 
 #ifdef HAS_ARGBTOYJROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
+                                        uint8* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
     vbroadcastf128 ymm5, xmmword ptr kAddYJ64
     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
@@ -1283,12 +1265,13 @@
 }
 #endif  //  HAS_ARGBTOYJROW_AVX2
 
-__declspec(naked)
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
+                                        uint8* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kBGRAToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1316,12 +1299,13 @@
   }
 }
 
-__declspec(naked)
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
+                                        uint8* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kABGRToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1349,12 +1333,13 @@
   }
 }
 
-__declspec(naked)
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
+                                        uint8* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kRGBAToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1382,24 +1367,26 @@
   }
 }
 
-__declspec(naked)
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+                                         int src_stride_argb,
+                                         uint8* dst_u,
+                                         uint8* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kARGBToV
     movdqa     xmm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1437,11 +1424,11 @@
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
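The three commented steps in the UV rows: average a 16x2 block of ARGB down to 8x1 (pavgb vertically, then shufps/pavgb horizontally), apply the U and V coefficient rows with pmaddubsw, then store U at [edx] and V at [edx + edi], where edi was pre-biased to dst_v - dst_u. Roughly the same computation in scalar form, with conventional BT.601 chroma weights used for illustration and width assumed even:

// 2x2 subsample ARGB, then convert each averaged pixel to one U and one V sample.
static void ARGBToUVRow_C_sketch(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_argb1 = src_argb0 + src_stride_argb;  // second input row
  for (int x = 0; x < width; x += 2) {
    // step 1 - average a 2x2 block of pixels.
    int ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
    int ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
    int ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
    // step 2 - U/V coefficient rows; the 0x8080 bias folds in the +128 offset.
    *dst_u++ = (uint8)((112 * ab - 74 * ag - 38 * ar + 0x8080) >> 8);
    *dst_v++ = (uint8)((112 * ar - 94 * ag - 18 * ab + 0x8080) >> 8);
    // step 3 - the SIMD code stores the U and V halves of one register; here
    // the two planes are written directly.
    src_argb0 += 8;
    src_argb1 += 8;
  }
}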
@@ -1452,24 +1439,26 @@
   }
 }
 
-__declspec(naked)
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+                                          int src_stride_argb,
+                                          uint8* dst_u,
+                                          uint8* dst_v,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUVJ128
     movdqa     xmm6, xmmword ptr kARGBToVJ
     movdqa     xmm7, xmmword ptr kARGBToUJ
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1511,8 +1500,8 @@
     packsswb   xmm0, xmm1
 
     // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1524,24 +1513,26 @@
 }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked)
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
+                                        int src_stride_argb,
+                                        uint8* dst_u,
+                                        uint8* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     vbroadcastf128 ymm5, xmmword ptr kAddUV128
     vbroadcastf128 ymm6, xmmword ptr kARGBToV
     vbroadcastf128 ymm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx   // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+        /* step 1 - subsample 32x2 argb pixels to 16x1 */
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     vmovdqu    ymm2, [eax + 64]
@@ -1575,8 +1566,8 @@
     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
 
     // step 3 - store 16 U and 16 V values
-    vextractf128 [edx], ymm0, 0 // U
-    vextractf128 [edx + edi], ymm0, 1 // V
+    vextractf128 [edx], ymm0, 0  // U
+    vextractf128 [edx + edi], ymm0, 1  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -1590,24 +1581,26 @@
 #endif  // HAS_ARGBTOUVROW_AVX2
 
 #ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked)
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+                                         int src_stride_argb,
+                                         uint8* dst_u,
+                                         uint8* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     vbroadcastf128 ymm5, xmmword ptr kAddUV128
     vbroadcastf128 ymm6, xmmword ptr kARGBToV
     vbroadcastf128 ymm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx   // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+        /* step 1 - subsample 32x2 argb pixels to 16x1 */
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     vmovdqu    ymm2, [eax + 64]
@@ -1642,8 +1635,8 @@
     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
 
     // step 3 - store 16 U and 16 V values
-    vextractf128 [edx], ymm0, 0 // U
-    vextractf128 [edx + edi], ymm0, 1 // V
+    vextractf128 [edx], ymm0, 0  // U
+    vextractf128 [edx + edi], ymm0, 1  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -1656,23 +1649,24 @@
 }
 #endif  // HAS_ARGBTOUVJROW_AVX2
 
-__declspec(naked)
-void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
-                          uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+                                            uint8* dst_u,
+                                            uint8* dst_v,
+                                            int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        edx, [esp + 4 + 8]  // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kARGBToV
     movdqa     xmm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx    // stride from u to v
 
  convertloop:
-    /* convert to U and V */
-    movdqu     xmm0, [eax]          // U
+        /* convert to U and V */
+    movdqu     xmm0, [eax]  // U
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
     movdqu     xmm3, [eax + 48]
@@ -1688,7 +1682,7 @@
     paddb      xmm0, xmm5
     movdqu     [edx], xmm0
 
-    movdqu     xmm0, [eax]          // V
+    movdqu     xmm0, [eax]  // V
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
     movdqu     xmm3, [eax + 48]
@@ -1713,24 +1707,26 @@
   }
 }
 
-__declspec(naked)
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
+                                         int src_stride_argb,
+                                         uint8* dst_u,
+                                         uint8* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kBGRAToV
     movdqa     xmm7, xmmword ptr kBGRAToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1768,11 +1764,11 @@
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1783,24 +1779,26 @@
   }
 }
 
-__declspec(naked)
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
+                                         int src_stride_argb,
+                                         uint8* dst_u,
+                                         uint8* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kABGRToV
     movdqa     xmm7, xmmword ptr kABGRToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1838,11 +1836,11 @@
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1853,24 +1851,26 @@
   }
 }
 
-__declspec(naked)
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
+                                         int src_stride_argb,
+                                         uint8* dst_u,
+                                         uint8* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kRGBAToV
     movdqa     xmm7, xmmword ptr kRGBAToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1908,11 +1908,11 @@
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
     // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1925,109 +1925,95 @@
 #endif  // HAS_ARGBTOYROW_SSSE3
 
 // Read 16 UV from 444
-#define READYUV444_AVX2 __asm {                                                \
-    __asm vmovdqu    xmm0, [esi]                  /* U */                      \
-    __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \
+#define READYUV444_AVX2 \
+  __asm {                                                \
+    __asm vmovdqu    xmm0, [esi] /* U */                      \
+    __asm vmovdqu    xmm1, [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpermq     ymm1, ymm1, 0xd8                                          \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 __asm {                                                \
-    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+#define READYUV422_AVX2 \
+  __asm {                                                \
+    __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
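
The READYUV422* macros load 8 U and 8 V bytes and duplicate each chroma sample across two pixels (the vpunpcklwd ymm0, ymm0 step), i.e. a nearest-neighbor 4:2:2 to 4:4:4 upsample; READYUV444_AVX2 above skips that step because it already has one UV pair per pixel. A scalar sketch of the upsample, with a hypothetical helper name:

#include <stdint.h>

// Nearest-neighbor chroma upsample: every pair of output pixels shares one UV sample.
static void UpsampleUV422_Sketch(const uint8_t* src_u, const uint8_t* src_v,
                                 uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_u[x / 2];
    dst_v[x] = src_v[x / 2];
  }
}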
 
 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
-#define READYUVA422_AVX2 __asm {                                               \
-    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+#define READYUVA422_AVX2 \
+  __asm {                                               \
+    __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
     __asm lea        eax, [eax + 16]                                           \
-    __asm vmovdqu    xmm5, [ebp]                  /* A */                      \
+    __asm vmovdqu    xmm5, [ebp] /* A */                      \
     __asm vpermq     ymm5, ymm5, 0xd8                                          \
-    __asm lea        ebp, [ebp + 16]                                           \
-  }
-
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2 __asm {                                                \
-    __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \
-    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \
-    __asm lea        esi,  [esi + 4]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
-    __asm vpermq     ymm4, ymm4, 0xd8                                          \
-    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        ebp, [ebp + 16]}
 
 // Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 __asm {                                                  \
-    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+#define READNV12_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi] /* UV */                     \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 UV from NV21, upsample to 16 UV.
-#define READNV21_AVX2 __asm {                                                  \
-    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+#define READNV21_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi] /* UV */                     \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
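
READNV12_AVX2 reads interleaved U,V byte pairs and upsamples them with vpunpcklwd; READNV21_AVX2 reads V,U pairs and relies on the kShuffleNV21 table instead, which swaps each pair back into U,V order and duplicates it (note it has no separate vpunpcklwd). A scalar sketch of the byte swap alone, with a hypothetical helper name:

#include <stdint.h>

// Swap NV21's V,U byte pairs into NV12's U,V order. 'pairs' is the number of VU pairs.
static void Nv21ToNv12UV_Sketch(const uint8_t* src_vu, uint8_t* dst_uv, int pairs) {
  for (int i = 0; i < pairs; ++i) {
    dst_uv[2 * i + 0] = src_vu[2 * i + 1];  // U
    dst_uv[2 * i + 1] = src_vu[2 * i + 0];  // V
  }
}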
 
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 __asm {                                                  \
-    __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
+#define READYUY2_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax] /* YUY2 */                           \
     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
-    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vmovdqu    ymm0, [eax] /* UV */                             \
     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
-    __asm lea        eax, [eax + 32]                                           \
-  }
+    __asm lea        eax, [eax + 32]}
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 __asm {                                                  \
-    __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \
+#define READUYVY_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax] /* UYVY */                           \
     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
-    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vmovdqu    ymm0, [eax] /* UV */                             \
     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
-    __asm lea        eax, [eax + 32]                                           \
-  }
+    __asm lea        eax, [eax + 32]}
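
YUY2 packs two pixels as Y0,U,Y1,V and UYVY packs them as U,Y0,V,Y1; the kShuffleYUY2* and kShuffleUYVY* tables pull the Y bytes and the shared UV bytes into separate registers. A scalar sketch of the YUY2 case, splitting one packed row into planar Y, U and V (helper name hypothetical):

#include <stdint.h>

// Split packed YUY2 (Y0 U Y1 V per two pixels) into planar Y, U, V.
// Width is in pixels and assumed even.
static void SplitYuy2_Sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                             uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_y[x + 0] = src_yuy2[2 * x + 0];
    dst_u[x / 2] = src_yuy2[2 * x + 1];
    dst_y[x + 1] = src_yuy2[2 * x + 2];
    dst_v[x / 2] = src_yuy2[2 * x + 3];
  }
}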
 
 // Convert 16 pixels: 16 UV and 16 Y.
-#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
+#define YUVTORGB_AVX2(YuvConstants) \
+  __asm {                                    \
     __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
     __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
     __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
@@ -2036,68 +2022,67 @@
     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
     __asm vpsubw     ymm1, ymm3, ymm1                                          \
     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
-    __asm vpsubw     ymm0, ymm3, ymm0                                          \
-    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
+    __asm vpsubw     ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */                       \
     __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
-    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
-    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
-    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
+    __asm vpaddsw    ymm0, ymm0, ymm4 /* B += Y */                   \
+    __asm vpaddsw    ymm1, ymm1, ymm4 /* G += Y */                   \
+    __asm vpaddsw    ymm2, ymm2, ymm4 /* R += Y */                   \
     __asm vpsraw     ymm0, ymm0, 6                                             \
     __asm vpsraw     ymm1, ymm1, 6                                             \
     __asm vpsraw     ymm2, ymm2, 6                                             \
-    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
-    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
-    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
+    __asm vpackuswb  ymm0, ymm0, ymm0 /* B */                        \
+    __asm vpackuswb  ymm1, ymm1, ymm1 /* G */                        \
+    __asm vpackuswb  ymm2, ymm2, ymm2 /* R */                  \
   }
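
YUVTORGB_AVX2 (and the SSSE3 YUVTORGB further down) is the usual fixed-point conversion: the UV contribution is subtracted from a per-channel bias, the Y term scaled by KYTORGB is added, and the result is shifted right by 6 and saturated to bytes. A scalar sketch of the same math with illustrative BT.601 studio-swing coefficients scaled by 64; the real values come from the KUVTO*/KUVBIAS*/KYTORGB tables in YuvConstants, and the helper names are hypothetical:

#include <stdint.h>

static uint8_t ClampToByte(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One pixel of YUV -> RGB in 6-bit fixed point; coefficients are roughly BT.601 * 64.
static void YuvToRgb_Sketch(uint8_t y, uint8_t u, uint8_t v,
                            uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = 74 * ((int)y - 16);                      // ~1.164 * 64
  int uc = (int)u - 128;
  int vc = (int)v - 128;
  *b = ClampToByte((y1 + 129 * uc) >> 6);           // ~2.018 * 64
  *g = ClampToByte((y1 - 25 * uc - 52 * vc) >> 6);  // ~0.391 * 64, ~0.813 * 64
  *r = ClampToByte((y1 + 102 * vc) >> 6);           // ~1.596 * 64
}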
 
 // Store 16 ARGB values.
-#define STOREARGB_AVX2 __asm {                                                 \
-    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
+#define STOREARGB_AVX2 \
+  __asm {                                                 \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                       \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
+    __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                       \
     __asm vpermq     ymm2, ymm2, 0xd8                                          \
-    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
-    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
+    __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */      \
+    __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */       \
     __asm vmovdqu    0[edx], ymm1                                              \
     __asm vmovdqu    32[edx], ymm0                                             \
-    __asm lea        edx,  [edx + 64]                                          \
-  }
+    __asm lea        edx,  [edx + 64]}
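
STOREARGB_AVX2 weaves the packed B, G and R bytes together with the alpha register into libyuv's ARGB memory order (B,G,R,A per pixel). A scalar equivalent, with a hypothetical helper name:

#include <stdint.h>

// Interleave planar B, G, R with a constant alpha into 4-byte ARGB pixels.
static void StoreArgb_Sketch(const uint8_t* b, const uint8_t* g, const uint8_t* r,
                             uint8_t a, uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = b[x];
    dst_argb[4 * x + 1] = g[x];
    dst_argb[4 * x + 2] = r[x];
    dst_argb[4 * x + 3] = a;  // callers set this to 0xff via vpcmpeqb ymm5
  }
}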
 
 // Store 16 RGBA values.
-#define STORERGBA_AVX2 __asm {                                                 \
-    __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \
+#define STORERGBA_AVX2 \
+  __asm {                                                 \
+    __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                       \
     __asm vpermq     ymm1, ymm1, 0xd8                                          \
-    __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \
+    __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                       \
     __asm vpermq     ymm2, ymm2, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \
-    __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \
+    __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */      \
+    __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */       \
     __asm vmovdqu    [edx], ymm0                                               \
     __asm vmovdqu    [edx + 32], ymm1                                          \
-    __asm lea        edx,  [edx + 64]                                          \
-  }
+    __asm lea        edx,  [edx + 64]}
 
 #ifdef HAS_I422TOARGBROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I422ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void I422ToARGBRow_AVX2(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUV422_AVX2
@@ -2119,21 +2104,21 @@
 #ifdef HAS_I422ALPHATOARGBROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
+__declspec(naked) void I422AlphaToARGBRow_AVX2(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    const uint8* a_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
     push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
+    mov        eax, [esp + 16 + 4]  // Y
+    mov        esi, [esp + 16 + 8]  // U
     mov        edi, [esp + 16 + 12]  // V
     mov        ebp, [esp + 16 + 16]  // A
     mov        edx, [esp + 16 + 20]  // argb
@@ -2162,25 +2147,25 @@
 #ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I444ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void I444ToARGBRow_AVX2(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
  convertloop:
     READYUV444_AVX2
     YUVTORGB_AVX2(ebx)
@@ -2198,64 +2183,24 @@
 }
 #endif  // HAS_I444TOARGBROW_AVX2
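
These row kernels each convert a single row; the planar converters (e.g. I422ToARGB in convert_argb.cc) pick one at run time based on CPU features and loop it over the image. A hedged sketch of such a driver loop for an I422 source; it reuses the uint8 and YuvConstants types from this file, and the loop is illustrative rather than libyuv's exact dispatch code:

// Illustrative driver: apply an I422ToARGBRow_* kernel to every row of an image.
typedef void (*I422ToARGBRowFn)(const uint8* y_buf, const uint8* u_buf,
                                const uint8* v_buf, uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width);

static void I422ToARGBLoop_Sketch(I422ToARGBRowFn row_fn,
                                  const uint8* src_y, int stride_y,
                                  const uint8* src_u, int stride_u,
                                  const uint8* src_v, int stride_v,
                                  uint8* dst_argb, int dst_stride_argb,
                                  const struct YuvConstants* yuvconstants,
                                  int width, int height) {
  for (int y = 0; y < height; ++y) {
    row_fn(src_y, src_u, src_v, dst_argb, yuvconstants, width);
    src_y += stride_y;
    src_u += stride_u;  // I422: chroma advances every row; I420 would advance
    src_v += stride_v;  // the chroma pointers only every second row.
    dst_argb += dst_stride_argb;
  }
}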
 
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I411ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // abgr
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READYUV411_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I411TOARGBROW_AVX2
-
 #ifdef HAS_NV12TOARGBROW_AVX2
 // 16 pixels.
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV12ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void NV12ToARGBRow_AVX2(
+    const uint8* y_buf,
+    const uint8* uv_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // UV
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // UV
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READNV12_AVX2
@@ -2276,21 +2221,21 @@
 #ifdef HAS_NV21TOARGBROW_AVX2
 // 16 pixels.
 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV21ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* vu_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void NV21ToARGBRow_AVX2(
+    const uint8* y_buf,
+    const uint8* vu_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // VU
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // VU
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READNV21_AVX2
@@ -2311,18 +2256,18 @@
 #ifdef HAS_YUY2TOARGBROW_AVX2
 // 16 pixels.
 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void YUY2ToARGBRow_AVX2(
+    const uint8* src_yuy2,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // yuy2
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // yuy2
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUY2_AVX2
@@ -2342,18 +2287,18 @@
 #ifdef HAS_UYVYTOARGBROW_AVX2
 // 16 pixels.
 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void UYVYToARGBRow_AVX2(
+    const uint8* src_uyvy,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // uyvy
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // uyvy
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READUYVY_AVX2
@@ -2373,25 +2318,25 @@
 #ifdef HAS_I422TORGBAROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-__declspec(naked)
-void I422ToRGBARow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void I422ToRGBARow_AVX2(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // abgr
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUV422_AVX2
@@ -2415,100 +2360,83 @@
 // Allows a conversion with half size scaling.
 
 // Read 8 UV from 444.
-#define READYUV444 __asm {                                                     \
+#define READYUV444 \
+  __asm {                                                     \
     __asm movq       xmm0, qword ptr [esi] /* U */                             \
     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 __asm {                                                     \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
+#define READYUV422 \
+  __asm {                                                     \
+    __asm movd       xmm0, [esi] /* U */                              \
+    __asm movd       xmm1, [esi + edi] /* V */                              \
     __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422 __asm {                                                    \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
+#define READYUVA422 \
+  __asm {                                                    \
+    __asm movd       xmm0, [esi] /* U */                              \
+    __asm movd       xmm1, [esi + edi] /* V */                              \
     __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax] /* Y */                           \
     __asm punpcklbw  xmm4, xmm4                                                \
     __asm lea        eax, [eax + 8]                                            \
-    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
-    __asm lea        ebp, [ebp + 8]                                            \
-  }
-
-// Read 2 UV from 411, upsample to 8 UV.
-// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
-//  __asm pinsrw     xmm0, [esi], 0        /* U */
-//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
-#define READYUV411_EBX __asm {                                                 \
-    __asm movzx      ebx, word ptr [esi]        /* U */                        \
-    __asm movd       xmm0, ebx                                                 \
-    __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
-    __asm movd       xmm1, ebx                                                 \
-    __asm lea        esi,  [esi + 2]                                           \
-    __asm punpcklbw  xmm0, xmm1            /* UV */                            \
-    __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
-    __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
-    __asm movq       xmm4, qword ptr [eax]                                     \
-    __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm movq       xmm5, qword ptr [ebp] /* A */                           \
+    __asm lea        ebp, [ebp + 8]}
 
 // Read 4 UV from NV12, upsample to 8 UV.
-#define READNV12 __asm {                                                       \
+#define READNV12 \
+  __asm {                                                       \
     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 VU from NV21, upsample to 8 UV.
-#define READNV21 __asm {                                                       \
+#define READNV21 \
+  __asm {                                                       \
     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
     __asm lea        esi,  [esi + 8]                                           \
     __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
-#define READYUY2 __asm {                                                       \
-    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
+#define READYUY2 \
+  __asm {                                                       \
+    __asm movdqu     xmm4, [eax] /* YUY2 */                           \
     __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
-    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm movdqu     xmm0, [eax] /* UV */                             \
     __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
-#define READUYVY __asm {                                                       \
-    __asm movdqu     xmm4, [eax]          /* UYVY */                           \
+#define READUYVY \
+  __asm {                                                       \
+    __asm movdqu     xmm4, [eax] /* UYVY */                           \
     __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
-    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm movdqu     xmm0, [eax] /* UV */                             \
     __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(YuvConstants) __asm {                                         \
+#define YUVTORGB(YuvConstants) \
+  __asm {                                         \
     __asm movdqa     xmm1, xmm0                                                \
     __asm movdqa     xmm2, xmm0                                                \
     __asm movdqa     xmm3, xmm0                                                \
@@ -2522,129 +2450,125 @@
     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
     __asm psubw      xmm2, xmm3                                                \
     __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
-    __asm paddsw     xmm0, xmm4           /* B += Y */                         \
-    __asm paddsw     xmm1, xmm4           /* G += Y */                         \
-    __asm paddsw     xmm2, xmm4           /* R += Y */                         \
+    __asm paddsw     xmm0, xmm4 /* B += Y */                         \
+    __asm paddsw     xmm1, xmm4 /* G += Y */                         \
+    __asm paddsw     xmm2, xmm4 /* R += Y */                         \
     __asm psraw      xmm0, 6                                                   \
     __asm psraw      xmm1, 6                                                   \
     __asm psraw      xmm2, 6                                                   \
-    __asm packuswb   xmm0, xmm0           /* B */                              \
-    __asm packuswb   xmm1, xmm1           /* G */                              \
-    __asm packuswb   xmm2, xmm2           /* R */                              \
+    __asm packuswb   xmm0, xmm0 /* B */                              \
+    __asm packuswb   xmm1, xmm1 /* G */                              \
+    __asm packuswb   xmm2, xmm2 /* R */             \
   }
 
 // Store 8 ARGB values.
-#define STOREARGB __asm {                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
+#define STOREARGB \
+  __asm {                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm5 /* RA */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
+    __asm punpcklwd  xmm0, xmm2 /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRA next 4 pixels */             \
     __asm movdqu     0[edx], xmm0                                              \
     __asm movdqu     16[edx], xmm1                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 BGRA values.
-#define STOREBGRA __asm {                                                      \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
-    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
+#define STOREBGRA \
+  __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm0 /* GB */                             \
+    __asm punpcklbw  xmm5, xmm2 /* AR */                             \
     __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
+    __asm punpcklwd  xmm5, xmm1 /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1 /* BGRA next 4 pixels */             \
     __asm movdqu     0[edx], xmm5                                              \
     __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 RGBA values.
-#define STORERGBA __asm {                                                      \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
-    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
+#define STORERGBA \
+  __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm2 /* GR */                             \
+    __asm punpcklbw  xmm5, xmm0 /* AB */                             \
     __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
+    __asm punpcklwd  xmm5, xmm1 /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1 /* RGBA next 4 pixels */             \
     __asm movdqu     0[edx], xmm5                                              \
     __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 RGB24 values.
-#define STORERGB24 __asm {                                                     \
-    /* Weave into RRGB */                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+#define STORERGB24 \
+  __asm {/* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2 /* RR */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* RRGB -> RGB24 */                                                        \
-    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
-    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
-    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
-    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
-    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
-    __asm lea        edx,  [edx + 24]                                          \
-  }
+    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */                                                        \
+    __asm pshufb     xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6 /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0 /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1 /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]}
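
STORERGB24 first weaves the pixels into 4-byte BGRx groups, then uses the pshufb/palignr pair to squeeze out the fourth byte, leaving 3 bytes per pixel. The net effect in scalar form (helper name hypothetical; byte order B,G,R, following the weave above):

#include <stdint.h>

// Pack planar B, G, R into 3-byte RGB24 pixels.
static void StoreRgb24_Sketch(const uint8_t* b, const uint8_t* g, const uint8_t* r,
                              uint8_t* dst_rgb24, int width) {
  for (int x = 0; x < width; ++x) {
    dst_rgb24[3 * x + 0] = b[x];
    dst_rgb24[3 * x + 1] = g[x];
    dst_rgb24[3 * x + 2] = r[x];
  }
}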
 
 // Store 8 RGB565 values.
-#define STORERGB565 __asm {                                                    \
-    /* Weave into RRGB */                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+#define STORERGB565 \
+  __asm {/* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2 /* RR */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* RRGB -> RGB565 */                                                       \
-    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
-    __asm movdqa     xmm2, xmm0    /* G */                                     \
-    __asm pslld      xmm0, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm0, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm0, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm0, xmm3    /* BGR */                                   \
-    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
-    __asm movdqa     xmm2, xmm1    /* G */                                     \
-    __asm pslld      xmm1, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm1, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm1, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm1, xmm3    /* BGR */                                   \
+    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */                                                       \
+    __asm movdqa     xmm3, xmm0 /* B  first 4 pixels of argb */             \
+    __asm movdqa     xmm2, xmm0 /* G */                                     \
+    __asm pslld      xmm0, 8 /* R */                                     \
+    __asm psrld      xmm3, 3 /* B */                                     \
+    __asm psrld      xmm2, 5 /* G */                                     \
+    __asm psrad      xmm0, 16 /* R */                                     \
+    __asm pand       xmm3, xmm5 /* B */                                     \
+    __asm pand       xmm2, xmm6 /* G */                                     \
+    __asm pand       xmm0, xmm7 /* R */                                     \
+    __asm por        xmm3, xmm2 /* BG */                                    \
+    __asm por        xmm0, xmm3 /* BGR */                                   \
+    __asm movdqa     xmm3, xmm1 /* B  next 4 pixels of argb */              \
+    __asm movdqa     xmm2, xmm1 /* G */                                     \
+    __asm pslld      xmm1, 8 /* R */                                     \
+    __asm psrld      xmm3, 3 /* B */                                     \
+    __asm psrld      xmm2, 5 /* G */                                     \
+    __asm psrad      xmm1, 16 /* R */                                     \
+    __asm pand       xmm3, xmm5 /* B */                                     \
+    __asm pand       xmm2, xmm6 /* G */                                     \
+    __asm pand       xmm1, xmm7 /* R */                                     \
+    __asm por        xmm3, xmm2 /* BG */                                    \
+    __asm por        xmm1, xmm3 /* BGR */                                   \
     __asm packssdw   xmm0, xmm1                                                \
-    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
-    __asm lea        edx, [edx + 16]                                           \
-  }
+    __asm movdqu     0[edx], xmm0 /* store 8 pixels of RGB565 */              \
+    __asm lea        edx, [edx + 16]}
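
STORERGB565 reduces each pixel to 16 bits: 5 bits of blue in the low bits, 6 bits of green, and 5 bits of red in the high bits, matching the 0x0000001f / 0x000007e0 / 0xfffff800 masks set up by the caller (see I422ToRGB565Row_SSSE3 below). Per pixel this amounts to (helper name hypothetical):

#include <stdint.h>

// Pack one 8-bit-per-channel pixel into RGB565 (R in bits 15..11, G in 10..5, B in 4..0).
static uint16_t PackRgb565_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}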
 
 // 8 pixels.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void I444ToARGBRow_SSSE3(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUV444
@@ -2663,19 +2587,19 @@
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked)
-void I422ToRGB24Row_SSSE3(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* dst_rgb24,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
+__declspec(naked) void I422ToRGB24Row_SSSE3(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    uint8* dst_rgb24,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
@@ -2701,30 +2625,30 @@
 
 // 8 pixels
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
-__declspec(naked)
-void I422ToRGB565Row_SSSE3(const uint8* y_buf,
-                           const uint8* u_buf,
-                           const uint8* v_buf,
-                           uint8* rgb565_buf,
-                           const struct YuvConstants* yuvconstants,
-                           int width) {
+__declspec(naked) void I422ToRGB565Row_SSSE3(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    uint8* rgb565_buf,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
+    pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
     psrld      xmm5, 27
-    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
+    pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
     psrld      xmm6, 26
     pslld      xmm6, 5
-    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
+    pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
     pslld      xmm7, 11
 
  convertloop:
@@ -2744,25 +2668,25 @@
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void I422ToARGBRow_SSSE3(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUV422
@@ -2781,21 +2705,21 @@
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width) {
+__declspec(naked) void I422AlphaToARGBRow_SSSE3(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    const uint8* a_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
     push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
+    mov        eax, [esp + 16 + 4]  // Y
+    mov        esi, [esp + 16 + 8]  // U
     mov        edi, [esp + 16 + 12]  // V
     mov        ebp, [esp + 16 + 16]  // A
     mov        edx, [esp + 16 + 20]  // argb
@@ -2820,62 +2744,22 @@
 }
 
 // 8 pixels.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked)
-void I411ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
-    mov        edi, [esp + 16 + 12]  // V
-    mov        edx, [esp + 16 + 16]  // abgr
-    mov        ebp, [esp + 16 + 20]  // yuvconstants
-    mov        ecx, [esp + 16 + 24]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
-
- convertloop:
-    READYUV411_EBX
-    YUVTORGB(ebp)
-    STOREARGB
-
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* uv_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void NV12ToARGBRow_SSSE3(
+    const uint8* y_buf,
+    const uint8* uv_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // UV
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // UV
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READNV12
@@ -2893,21 +2777,21 @@
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* vu_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void NV21ToARGBRow_SSSE3(
+    const uint8* y_buf,
+    const uint8* vu_buf,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // VU
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // VU
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READNV21
@@ -2925,18 +2809,18 @@
 
 // 8 pixels.
 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void YUY2ToARGBRow_SSSE3(
+    const uint8* src_yuy2,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // yuy2
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // yuy2
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUY2
@@ -2953,18 +2837,18 @@
 
 // 8 pixels.
 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void UYVYToARGBRow_SSSE3(
+    const uint8* src_uyvy,
+    uint8* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // uyvy
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // uyvy
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READUYVY
@@ -2979,19 +2863,19 @@
   }
 }
 
-__declspec(naked)
-void I422ToRGBARow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_rgba,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void I422ToRGBARow_SSSE3(
+    const uint8* y_buf,
+    const uint8* u_buf,
+    const uint8* v_buf,
+    uint8* dst_rgba,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
@@ -3016,39 +2900,38 @@
 
 #ifdef HAS_I400TOARGBROW_SSE2
 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
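The constants in the asm below implement G = (y - 16) * 1.164 in fixed point: the byte-replicated Y times 18997, taken as a high 16-bit product, is about Y * 1.164 * 64; subtracting 1160 (16 * 1.164 * 64) with saturation and shifting right by 6 gives the 8-bit result. A scalar walk-through of the same steps:

  // Mirrors the punpcklbw / pmulhuw / psubusw / psrlw / packuswb sequence.
  static inline uint8 I400ToGrayRef(uint8 y) {
    uint32 yy = y * 257u;              // punpcklbw y,y replicates the byte: Y.Y
    uint32 g = (yy * 18997u) >> 16;    // 0x4a35 = round(1.164 * 64 * 256)
    g = (g > 1160u) ? g - 1160u : 0u;  // 0x0488 = round(1.164 * 64 * 16)
    g >>= 6;                           // back to 8-bit range
    return (uint8)(g > 255u ? 255u : g);  // packuswb saturation
  }
  // The value is then written to B, G and R, with alpha forced to 0xff.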
-__declspec(naked)
-void I400ToARGBRow_SSE2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
+                                          uint8* rgb_buf,
+                                          int width) {
   __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
     movd       xmm2, eax
     pshufd     xmm2, xmm2,0
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
     movd       xmm3, eax
     pshufd     xmm3, xmm3, 0
-    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
     pslld      xmm4, 24
 
-    mov        eax, [esp + 4]       // Y
-    mov        edx, [esp + 8]       // rgb
-    mov        ecx, [esp + 12]      // width
+    mov        eax, [esp + 4]  // Y
+    mov        edx, [esp + 8]  // rgb
+    mov        ecx, [esp + 12]  // width
 
  convertloop:
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+        // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     movq       xmm0, qword ptr [eax]
     lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm0           // Y.Y
+    punpcklbw  xmm0, xmm0  // Y.Y
     pmulhuw    xmm0, xmm2
     psubusw    xmm0, xmm3
     psrlw      xmm0, 6
-    packuswb   xmm0, xmm0           // G
+    packuswb   xmm0, xmm0        // G
 
     // Step 2: Weave into ARGB
-    punpcklbw  xmm0, xmm0           // GG
+    punpcklbw  xmm0, xmm0  // GG
     movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
+    punpcklwd  xmm0, xmm0  // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm1  // BGRA next 4 pixels
     por        xmm0, xmm4
     por        xmm1, xmm4
     movdqu     [edx], xmm0
@@ -3064,41 +2947,40 @@
 #ifdef HAS_I400TOARGBROW_AVX2
 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
 // note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked)
-void I400ToARGBRow_AVX2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
+                                          uint8* rgb_buf,
+                                          int width) {
   __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
     vmovd      xmm2, eax
     vbroadcastss ymm2, xmm2
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
     vmovd      xmm3, eax
     vbroadcastss ymm3, xmm3
-    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xff000000
     vpslld     ymm4, ymm4, 24
 
-    mov        eax, [esp + 4]       // Y
-    mov        edx, [esp + 8]       // rgb
-    mov        ecx, [esp + 12]      // width
+    mov        eax, [esp + 4]  // Y
+    mov        edx, [esp + 8]  // rgb
+    mov        ecx, [esp + 12]  // width
 
  convertloop:
-    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+        // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
     vmovdqu    xmm0, [eax]
     lea        eax, [eax + 16]
-    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
-    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
+    vpermq     ymm0, ymm0, 0xd8  // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
     vpmulhuw   ymm0, ymm0, ymm2
     vpsubusw   ymm0, ymm0, ymm3
     vpsrlw     ymm0, ymm0, 6
-    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
+    vpackuswb  ymm0, ymm0, ymm0        // G.  still mutated: 3120
 
     // TODO(fbarchard): Weave alpha with unpack.
     // Step 2: Weave into ARGB
-    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
+    vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
     vpermq     ymm1, ymm1, 0xd8
-    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
-    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
+    vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
     vpor       ymm0, ymm0, ymm4
     vpor       ymm1, ymm1, ymm4
     vmovdqu    [edx], ymm0
@@ -3114,16 +2996,16 @@
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
-static const uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
 
 // TODO(fbarchard): Replace lea with -16 offset.
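kShuffleMirror just reverses the 16 bytes of one register, so a single pshufb mirrors a 16-byte block. The scalar form of the whole row operation is:

  // Scalar reference for MirrorRow (illustrative, not the libyuv C path).
  static void MirrorRow_Ref(const uint8* src, uint8* dst, int width) {
    for (int x = 0; x < width; ++x) {
      dst[x] = src[width - 1 - x];
    }
  }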
-__declspec(naked)
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_SSSE3(const uint8* src,
+                                       uint8* dst,
+                                       int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     movdqa    xmm5, xmmword ptr kShuffleMirror
 
@@ -3140,11 +3022,10 @@
 #endif  // HAS_MIRRORROW_SSSE3
 
 #ifdef HAS_MIRRORROW_AVX2
-__declspec(naked)
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
 
@@ -3164,17 +3045,17 @@
 
 #ifdef HAS_MIRRORUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
 
-__declspec(naked)
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
-                       int width) {
+__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
+                                         uint8* dst_u,
+                                         uint8* dst_v,
+                                         int width) {
   __asm {
     push      edi
-    mov       eax, [esp + 4 + 4]   // src
-    mov       edx, [esp + 4 + 8]   // dst_u
+    mov       eax, [esp + 4 + 4]  // src
+    mov       edx, [esp + 4 + 8]  // dst_u
     mov       edi, [esp + 4 + 12]  // dst_v
     mov       ecx, [esp + 4 + 16]  // width
     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
@@ -3198,11 +3079,12 @@
 #endif  // HAS_MIRRORUVROW_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked)
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
+                                          uint8* dst,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
 
@@ -3221,15 +3103,14 @@
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 
-__declspec(naked)
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
+                                          uint8* dst,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
 
@@ -3246,16 +3127,17 @@
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked)
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
+__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
+                                       uint8* dst_u,
+                                       uint8* dst_v,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uv
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3265,10 +3147,10 @@
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     movdqa     xmm3, xmm1
-    pand       xmm0, xmm5   // even bytes
+    pand       xmm0, xmm5  // even bytes
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
-    psrlw      xmm2, 8      // odd bytes
+    psrlw      xmm2, 8  // odd bytes
     psrlw      xmm3, 8
     packuswb   xmm2, xmm3
     movdqu     [edx], xmm0
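In scalar terms SplitUVRow just de-interleaves alternating bytes: the pand with the 0x00ff00ff mask keeps the even (U) bytes and the psrlw by 8 exposes the odd (V) bytes before each half is packed back down. A reference sketch, where width counts output U/V samples:

  // Illustrative scalar equivalent of SplitUVRow.
  static void SplitUVRow_Ref(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int width) {
    for (int x = 0; x < width; ++x) {
      dst_u[x] = src_uv[2 * x + 0];  // even bytes
      dst_v[x] = src_uv[2 * x + 1];  // odd bytes
    }
  }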
@@ -3285,16 +3167,17 @@
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked)
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
+__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
+                                       uint8* dst_u,
+                                       uint8* dst_v,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uv
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3302,9 +3185,9 @@
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm2, ymm0, 8      // odd bytes
+    vpsrlw     ymm2, ymm0, 8  // odd bytes
     vpsrlw     ymm3, ymm1, 8
-    vpand      ymm0, ymm0, ymm5   // even bytes
+    vpand      ymm0, ymm0, ymm5  // even bytes
     vpand      ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpackuswb  ymm2, ymm2, ymm3
@@ -3324,24 +3207,25 @@
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked)
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
+__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
+                                       const uint8* src_v,
+                                       uint8* dst_uv,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
+    mov        eax, [esp + 4 + 4]  // src_u
+    mov        edx, [esp + 4 + 8]  // src_v
+    mov        edi, [esp + 4 + 12]  // dst_uv
+    mov        ecx, [esp + 4 + 16]  // width
     sub        edx, eax
 
   convertloop:
-    movdqu     xmm0, [eax]      // read 16 U's
+    movdqu     xmm0, [eax]  // read 16 U's
     movdqu     xmm1, [eax + edx]  // and 16 V's
     lea        eax,  [eax + 16]
     movdqa     xmm2, xmm0
-    punpcklbw  xmm0, xmm1       // first 8 UV pairs
-    punpckhbw  xmm2, xmm1       // next 8 UV pairs
+    punpcklbw  xmm0, xmm1  // first 8 UV pairs
+    punpckhbw  xmm2, xmm1  // next 8 UV pairs
     movdqu     [edi], xmm0
     movdqu     [edi + 16], xmm2
     lea        edi, [edi + 32]
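MergeUVRow is the inverse: punpcklbw and punpckhbw interleave 16 U bytes with 16 V bytes into the first and next 8 UV pairs. The scalar sketch is the mirror image of the split above:

  // Illustrative scalar equivalent of MergeUVRow; width counts UV pairs.
  static void MergeUVRow_Ref(const uint8* src_u, const uint8* src_v,
                             uint8* dst_uv, int width) {
    for (int x = 0; x < width; ++x) {
      dst_uv[2 * x + 0] = src_u[x];
      dst_uv[2 * x + 1] = src_v[x];
    }
  }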
@@ -3355,24 +3239,25 @@
 #endif  //  HAS_MERGEUVROW_SSE2
 
 #ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked)
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
+__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
+                                       const uint8* src_v,
+                                       uint8* dst_uv,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
+    mov        eax, [esp + 4 + 4]  // src_u
+    mov        edx, [esp + 4 + 8]  // src_v
+    mov        edi, [esp + 4 + 12]  // dst_uv
+    mov        ecx, [esp + 4 + 16]  // width
     sub        edx, eax
 
   convertloop:
-    vmovdqu    ymm0, [eax]           // read 32 U's
-    vmovdqu    ymm1, [eax + edx]     // and 32 V's
+    vmovdqu    ymm0, [eax]  // read 32 U's
+    vmovdqu    ymm1, [eax + edx]  // and 32 V's
     lea        eax,  [eax + 32]
-    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
-    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
-    vextractf128 [edi], ymm2, 0       // bytes 0..15
+    vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
+    vextractf128 [edi], ymm2, 0  // bytes 0..15
     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
     vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
@@ -3389,11 +3274,10 @@
 
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
-__declspec(naked)
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
     mov        ecx, [esp + 12]  // count
     test       eax, 15
     jne        convertloopu
@@ -3427,11 +3311,10 @@
 
 #ifdef HAS_COPYROW_AVX
 // CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
-__declspec(naked)
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
     mov        ecx, [esp + 12]  // count
 
   convertloop:
@@ -3451,13 +3334,12 @@
 #endif  // HAS_COPYROW_AVX
 
 // Multiple of 1.
-__declspec(naked)
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
   __asm {
     mov        eax, esi
     mov        edx, edi
-    mov        esi, [esp + 4]   // src
-    mov        edi, [esp + 8]   // dst
+    mov        esi, [esp + 4]  // src
+    mov        edi, [esp + 8]  // dst
     mov        ecx, [esp + 12]  // count
     rep movsb
     mov        edi, edx
@@ -3468,15 +3350,16 @@
 
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
+                                             uint8* dst,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
     mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
     pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
     psrld      xmm1, 8
 
   convertloop:
@@ -3504,14 +3387,15 @@
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
+                                             uint8* dst,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
     mov        ecx, [esp + 12]  // count
     vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
 
   convertloop:
     vmovdqu    ymm1, [eax]
@@ -3533,11 +3417,12 @@
 
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
+                                                uint8* dst_a,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_a
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_a
     mov        ecx, [esp + 12]  // width
 
   extractloop:
@@ -3558,17 +3443,54 @@
 }
 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
 
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+// width in pixels
+__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
+                                                uint8* dst_a,
+                                                int width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_a
+    mov        ecx, [esp + 12]  // width
+    vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX
+
+  extractloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpsrld     ymm0, ymm0, 24
+    vpsrld     ymm1, ymm1, 24
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    lea        eax, [eax + 128]
+    vpackssdw  ymm0, ymm0, ymm1  // mutates
+    vpsrld     ymm2, ymm2, 24
+    vpsrld     ymm3, ymm3, 24
+    vpackssdw  ymm2, ymm2, ymm3  // mutates
+    vpackuswb  ymm0, ymm0, ymm2  // mutates
+    vpermd     ymm0, ymm4, ymm0  // unmutate
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         extractloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
+
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
+                                                uint8* dst,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
     mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
     pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
     psrld      xmm1, 8
 
   convertloop:
@@ -3598,14 +3520,15 @@
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
+                                                uint8* dst,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
     mov        ecx, [esp + 12]  // count
     vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
 
   convertloop:
     vpmovzxbd  ymm1, qword ptr [eax]
@@ -3630,14 +3553,13 @@
 #ifdef HAS_SETROW_X86
 // Write 'count' bytes using an 8 bit value repeated.
 // Count should be multiple of 4.
-__declspec(naked)
-void SetRow_X86(uint8* dst, uint8 v8, int count) {
+__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
   __asm {
-    movzx      eax, byte ptr [esp + 8]    // v8
+    movzx      eax, byte ptr [esp + 8]  // v8
     mov        edx, 0x01010101  // Duplicate byte to all bytes.
-    mul        edx              // overwrites edx with upper part of result.
+    mul        edx  // overwrites edx with upper part of result.
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
+    mov        edi, [esp + 4]  // dst
     mov        ecx, [esp + 12]  // count
     shr        ecx, 2
     rep stosd
@@ -3647,12 +3569,11 @@
 }
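The movzx/mul pair in SetRow_X86 above splats the byte across a dword before rep stosd stores it one dword at a time. The same idea in C, assuming count is a multiple of 4 as the comment requires:

  // Illustrative scalar equivalent of SetRow_X86.
  static void SetRow_Ref(uint8* dst, uint8 v8, int count) {
    const uint32 v32 = v8 * 0x01010101u;  // duplicate the byte into all 4 lanes
    uint32* d = (uint32*)(void*)dst;      // rep stosd writes whole dwords
    for (int i = 0; i < count / 4; ++i) {
      d[i] = v32;
    }
  }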
 
 // Write 'count' bytes using an 8 bit value repeated.
-__declspec(naked)
-void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+__declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
   __asm {
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v8
+    mov        edi, [esp + 4]  // dst
+    mov        eax, [esp + 8]  // v8
     mov        ecx, [esp + 12]  // count
     rep stosb
     mov        edi, edx
@@ -3661,12 +3582,11 @@
 }
 
 // Write 'count' 32 bit values.
-__declspec(naked)
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+__declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
   __asm {
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v32
+    mov        edi, [esp + 4]  // dst
+    mov        eax, [esp + 8]  // v32
     mov        ecx, [esp + 12]  // count
     rep stosd
     mov        edi, edx
@@ -3676,12 +3596,13 @@
 #endif  // HAS_SETROW_X86
 
 #ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked)
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+__declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
+                                       uint8* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_yuy2
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
 
@@ -3689,9 +3610,9 @@
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // even bytes are Y
+    vpand      ymm0, ymm0, ymm5  // even bytes are Y
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -3702,18 +3623,20 @@
   }
 }
 
-__declspec(naked)
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+                                        int stride_yuy2,
+                                        uint8* dst_u,
+                                        uint8* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3723,18 +3646,18 @@
     vpavgb     ymm0, ymm0, [eax + esi]
     vpavgb     ymm1, ymm1, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3746,16 +3669,17 @@
   }
 }
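A scalar view of the two routines above, collapsed into one function for brevity (libyuv keeps Y and UV extraction in separate row functions): the even bytes are Y, and the UV row averages each U/V with the row below using pavgb-style rounding, which is what turns 4:2:2 chroma into 4:2:0. The UV422 variants that follow skip the vertical average.

  // Illustrative sketch; width is in pixels, one 4-byte macropixel per step.
  static void YUY2ToI420Ref(const uint8* src_yuy2, int stride_yuy2,
                            uint8* dst_y, uint8* dst_u, uint8* dst_v,
                            int width) {
    for (int x = 0; x < width - 1; x += 2) {
      dst_y[x + 0] = src_yuy2[0];  // even bytes are Y
      dst_y[x + 1] = src_yuy2[2];
      // Average with the next row, rounding like pavgb: (a + b + 1) >> 1.
      dst_u[x / 2] = (uint8)((src_yuy2[1] + src_yuy2[1 + stride_yuy2] + 1) >> 1);
      dst_v[x / 2] = (uint8)((src_yuy2[3] + src_yuy2[3 + stride_yuy2] + 1) >> 1);
      src_yuy2 += 4;
    }
  }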
 
-__declspec(naked)
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                                           uint8* dst_u,
+                                           uint8* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3763,18 +3687,18 @@
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3785,21 +3709,21 @@
   }
 }
 
-__declspec(naked)
-void UYVYToYRow_AVX2(const uint8* src_uyvy,
-                     uint8* dst_y, int width) {
+__declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
+                                       uint8* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_uyvy
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
+    vpsrlw     ymm0, ymm0, 8  // odd bytes are Y
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -3810,18 +3734,20 @@
   }
 }
 
-__declspec(naked)
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+                                        int stride_uyvy,
+                                        uint8* dst_u,
+                                        uint8* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3831,18 +3757,18 @@
     vpavgb     ymm0, ymm0, [eax + esi]
     vpavgb     ymm1, ymm1, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3854,16 +3780,17 @@
   }
 }
 
-__declspec(naked)
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                                           uint8* dst_u,
+                                           uint8* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3871,18 +3798,18 @@
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3895,21 +3822,21 @@
 #endif  // HAS_YUY2TOYROW_AVX2
 
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked)
-void YUY2ToYRow_SSE2(const uint8* src_yuy2,
-                     uint8* dst_y, int width) {
+__declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+                                       uint8* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+    mov        eax, [esp + 4]  // src_yuy2
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
 
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // even bytes are Y
+    pand       xmm0, xmm5  // even bytes are Y
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -3920,18 +3847,20 @@
   }
 }
 
-__declspec(naked)
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+                                        int stride_yuy2,
+                                        uint8* dst_u,
+                                        uint8* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3943,13 +3872,13 @@
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm0, 8  // YUYV -> UVUV
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -3963,16 +3892,17 @@
   }
 }
 
-__declspec(naked)
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                                           uint8* dst_u,
+                                           uint8* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3980,13 +3910,13 @@
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm0, 8  // YUYV -> UVUV
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -3999,19 +3929,19 @@
   }
 }
 
-__declspec(naked)
-void UYVYToYRow_SSE2(const uint8* src_uyvy,
-                     uint8* dst_y, int width) {
+__declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
+                                       uint8* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_uyvy
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8    // odd bytes are Y
+    psrlw      xmm0, 8  // odd bytes are Y
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -4022,18 +3952,20 @@
   }
 }
 
-__declspec(naked)
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+                                        int stride_uyvy,
+                                        uint8* dst_u,
+                                        uint8* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -4045,13 +3977,13 @@
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm0, xmm5  // UYVY -> UVUV
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -4065,16 +3997,17 @@
   }
 }
 
-__declspec(naked)
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                                           uint8* dst_u,
+                                           uint8* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -4082,13 +4015,13 @@
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm0, xmm5  // UYVY -> UVUV
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -4108,13 +4041,15 @@
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
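The two formulas are algebraically identical: the -128 biases contribute -128 * 255 = -32640 in total, and the +32768 + 127 constant cancels that back to the +255 rounding term. The bias exists so pmaddubsw, which multiplies unsigned bytes by signed bytes, can do both products in one instruction. The blend itself, in scalar form:

  // Illustrative scalar equivalent of BlendPlaneRow.
  static void BlendPlaneRow_Ref(const uint8* src0, const uint8* src1,
                                const uint8* alpha, uint8* dst, int width) {
    for (int x = 0; x < width; ++x) {
      int a = alpha[x];
      dst[x] = (uint8)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
    }
  }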
-__declspec(naked)
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
+__declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
+                                           const uint8* src1,
+                                           const uint8* alpha,
+                                           uint8* dst,
+                                           int width) {
   __asm {
     push       esi
     push       edi
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
     psllw      xmm5, 8
     mov        eax, 0x80808080  // 128 for biasing image to signed.
     movd       xmm6, eax
@@ -4123,8 +4058,8 @@
     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
     movd       xmm7, eax
     pshufd     xmm7, xmm7, 0x00
-    mov        eax, [esp + 8 + 4]   // src0
-    mov        edx, [esp + 8 + 8]   // src1
+    mov        eax, [esp + 8 + 4]  // src0
+    mov        edx, [esp + 8 + 8]  // src1
     mov        esi, [esp + 8 + 12]  // alpha
     mov        edi, [esp + 8 + 16]  // dst
     mov        ecx, [esp + 8 + 20]  // width
@@ -4134,15 +4069,15 @@
 
     // 8 pixel loop.
   convertloop8:
-    movq       xmm0, qword ptr [esi]        // alpha
+    movq       xmm0, qword ptr [esi]  // alpha
     punpcklbw  xmm0, xmm0
-    pxor       xmm0, xmm5         // a, 255-a
+    pxor       xmm0, xmm5  // a, 255-a
     movq       xmm1, qword ptr [eax + esi]  // src0
     movq       xmm2, qword ptr [edx + esi]  // src1
     punpcklbw  xmm1, xmm2
-    psubb      xmm1, xmm6         // bias src0/1 - 128
+    psubb      xmm1, xmm6  // bias src0/1 - 128
     pmaddubsw  xmm0, xmm1
-    paddw      xmm0, xmm7         // unbias result - 32768 and round.
+    paddw      xmm0, xmm7  // unbias result - 32768 and round.
     psrlw      xmm0, 8
     packuswb   xmm0, xmm0
     movq       qword ptr [edi + esi], xmm0
@@ -4163,13 +4098,15 @@
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
+__declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
+                                          const uint8* src1,
+                                          const uint8* alpha,
+                                          uint8* dst,
+                                          int width) {
   __asm {
     push        esi
     push        edi
-    vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff00ff00
     vpsllw      ymm5, ymm5, 8
     mov         eax, 0x80808080  // 128 for biasing image to signed.
     vmovd       xmm6, eax
@@ -4177,8 +4114,8 @@
     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
     vmovd       xmm7, eax
     vbroadcastss ymm7, xmm7
-    mov         eax, [esp + 8 + 4]   // src0
-    mov         edx, [esp + 8 + 8]   // src1
+    mov         eax, [esp + 8 + 4]  // src0
+    mov         edx, [esp + 8 + 8]  // src1
     mov         esi, [esp + 8 + 12]  // alpha
     mov         edi, [esp + 8 + 16]  // dst
     mov         ecx, [esp + 8 + 20]  // width
@@ -4188,21 +4125,21 @@
 
     // 32 pixel loop.
   convertloop32:
-    vmovdqu     ymm0, [esi]        // alpha
-    vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
-    vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
-    vpxor       ymm3, ymm3, ymm5   // a, 255-a
-    vpxor       ymm0, ymm0, ymm5   // a, 255-a
+    vmovdqu     ymm0, [esi]  // alpha
+    vpunpckhbw  ymm3, ymm0, ymm0  // 8..15, 24..31
+    vpunpcklbw  ymm0, ymm0, ymm0  // 0..7, 16..23
+    vpxor       ymm3, ymm3, ymm5  // a, 255-a
+    vpxor       ymm0, ymm0, ymm5  // a, 255-a
     vmovdqu     ymm1, [eax + esi]  // src0
     vmovdqu     ymm2, [edx + esi]  // src1
     vpunpckhbw  ymm4, ymm1, ymm2
     vpunpcklbw  ymm1, ymm1, ymm2
-    vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
-    vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
+    vpsubb      ymm4, ymm4, ymm6  // bias src0/1 - 128
+    vpsubb      ymm1, ymm1, ymm6  // bias src0/1 - 128
     vpmaddubsw  ymm3, ymm3, ymm4
     vpmaddubsw  ymm0, ymm0, ymm1
-    vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
-    vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
+    vpaddw      ymm3, ymm3, ymm7  // unbias result - 32768 and round.
+    vpaddw      ymm0, ymm0, ymm7  // unbias result - 32768 and round.
     vpsrlw      ymm3, ymm3, 8
     vpsrlw      ymm0, ymm0, 8
     vpackuswb   ymm0, ymm0, ymm3
@@ -4221,52 +4158,51 @@
 
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
+static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
+                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
 
 // Blend 8 pixels at a time.
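Per pixel, the routine below computes the premultiplied-alpha form of an 'over' composite, with the result alpha forced opaque. A scalar sketch derived from the asm comments (not the libyuv C path):

  // dst = src + background * (256 - src_alpha) / 256, saturating; alpha = 255.
  static void ARGBBlendRow_Ref(const uint8* src_argb0, const uint8* src_argb1,
                               uint8* dst_argb, int width) {
    for (int x = 0; x < width; ++x) {
      int fb = 256 - src_argb0[3];  // (255 - alpha) + 1, from the pxor/paddw below
      for (int c = 0; c < 3; ++c) {
        int v = src_argb0[c] + ((src_argb1[c] * fb) >> 8);
        dst_argb[c] = (uint8)(v > 255 ? 255 : v);  // paddusb saturation
      }
      dst_argb[3] = 255;  // por with the 0xff000000 mask
      src_argb0 += 4;
      src_argb1 += 4;
      dst_argb += 4;
    }
  }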
-__declspec(naked)
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
+                                          const uint8* src_argb1,
+                                          uint8* dst_argb,
+                                          int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
+    pcmpeqb    xmm7, xmm7  // generate constant 0x0001
     psrlw      xmm7, 15
-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    pcmpeqb    xmm6, xmm6  // generate mask 0x00ff00ff
     psrlw      xmm6, 8
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
     psllw      xmm5, 8
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
     pslld      xmm4, 24
     sub        ecx, 4
-    jl         convertloop4b    // less than 4 pixels?
+    jl         convertloop4b  // less than 4 pixels?
 
     // 4 pixel loop.
   convertloop4:
-    movdqu     xmm3, [eax]      // src argb
+    movdqu     xmm3, [eax]  // src argb
     lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqu     xmm2, [esi]      // _r_b
-    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqu     xmm1, [esi]      // _a_g
+    movdqa     xmm0, xmm3  // src argb
+    pxor       xmm3, xmm4  // ~alpha
+    movdqu     xmm2, [esi]  // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
+    pand       xmm2, xmm6  // _r_b
+    paddw      xmm3, xmm7  // 256 - alpha
+    pmullw     xmm2, xmm3  // _r_b * alpha
+    movdqu     xmm1, [esi]  // _a_g
     lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
+    psrlw      xmm1, 8  // _a_g
+    por        xmm0, xmm4  // set alpha to 255
+    pmullw     xmm1, xmm3  // _a_g * alpha
+    psrlw      xmm2, 8  // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2  // + src argb
+    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1  // + src argb
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4278,24 +4214,24 @@
 
     // 1 pixel loop.
   convertloop1:
-    movd       xmm3, [eax]      // src argb
+    movd       xmm3, [eax]  // src argb
     lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
+    movdqa     xmm0, xmm3  // src argb
+    pxor       xmm3, xmm4  // ~alpha
+    movd       xmm2, [esi]  // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
+    pand       xmm2, xmm6  // _r_b
+    paddw      xmm3, xmm7  // 256 - alpha
+    pmullw     xmm2, xmm3  // _r_b * alpha
+    movd       xmm1, [esi]  // _a_g
     lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
+    psrlw      xmm1, 8  // _a_g
+    por        xmm0, xmm4  // set alpha to 255
+    pmullw     xmm1, xmm3  // _a_g * alpha
+    psrlw      xmm2, 8  // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2  // + src argb
+    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1  // + src argb
     movd       [edx], xmm0
     lea        edx, [edx + 4]
     sub        ecx, 1
@@ -4311,41 +4247,42 @@
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.
 static const uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+    3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
 };
 static const uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+    11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+    15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
 };
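Attenuation premultiplies each color channel by the pixel's alpha; the tables above duplicate each alpha across 16-bit lanes so pmulhuw can do the multiplies, so the rounding differs slightly from an exact divide by 255. The conceptual scalar form:

  // Illustrative reference: dst = argb * a / 255, alpha preserved.
  static void ARGBAttenuateRow_Ref(const uint8* src_argb, uint8* dst_argb,
                                   int width) {
    for (int x = 0; x < width; ++x) {
      int a = src_argb[3];
      dst_argb[0] = (uint8)(src_argb[0] * a / 255);  // B
      dst_argb[1] = (uint8)(src_argb[1] * a / 255);  // G
      dst_argb[2] = (uint8)(src_argb[2] * a / 255);  // R
      dst_argb[3] = (uint8)a;                        // original alpha copied
      src_argb += 4;
      dst_argb += 4;
    }
  }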
-__declspec(naked)
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
+                                              uint8* dst_argb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb0
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
-    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
+    pcmpeqb    xmm3, xmm3  // generate mask 0xff000000
     pslld      xmm3, 24
     movdqa     xmm4, xmmword ptr kShuffleAlpha0
     movdqa     xmm5, xmmword ptr kShuffleAlpha1
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
-    pshufb     xmm0, xmm4       // isolate first 2 alphas
-    movdqu     xmm1, [eax]      // read 4 pixels
-    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
-    pmulhuw    xmm0, xmm1       // rgb * a
-    movdqu     xmm1, [eax]      // read 4 pixels
-    pshufb     xmm1, xmm5       // isolate next 2 alphas
-    movdqu     xmm2, [eax]      // read 4 pixels
-    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
-    pmulhuw    xmm1, xmm2       // rgb * a
-    movdqu     xmm2, [eax]      // mask original alpha
+    movdqu     xmm0, [eax]  // read 4 pixels
+    pshufb     xmm0, xmm4  // isolate first 2 alphas
+    movdqu     xmm1, [eax]  // read 4 pixels
+    punpcklbw  xmm1, xmm1  // first 2 pixel rgbs
+    pmulhuw    xmm0, xmm1  // rgb * a
+    movdqu     xmm1, [eax]  // read 4 pixels
+    pshufb     xmm1, xmm5  // isolate next 2 alphas
+    movdqu     xmm2, [eax]  // read 4 pixels
+    punpckhbw  xmm2, xmm2  // next 2 pixel rgbs
+    pmulhuw    xmm1, xmm2  // rgb * a
+    movdqu     xmm2, [eax]  // mask original alpha
     lea        eax, [eax + 16]
     pand       xmm2, xmm3
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
-    por        xmm0, xmm2       // copy original alpha
+    por        xmm0, xmm2  // copy original alpha
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4358,22 +4295,23 @@
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
-__declspec(naked)
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
+                                         128u, 128u, 14u,  15u, 14u, 15u,
+                                         14u,  15u,  128u, 128u};
+__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
+                                             uint8* dst_argb,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb0
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
-    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
     vpslld     ymm5, ymm5, 24
 
  convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
@@ -4398,40 +4336,40 @@
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
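Unattenuation is the inverse: each channel is scaled back up by 255/alpha, saturating at 255. The routine below avoids a per-pixel divide by looking up a fixed-point reciprocal of alpha in fixed_invtbl8; the exact scaling of that table is not reproduced in this sketch:

  // Illustrative reference: undo the premultiplication, saturating at 255.
  static void ARGBUnattenuateRow_Ref(const uint8* src_argb, uint8* dst_argb,
                                     int width) {
    for (int x = 0; x < width; ++x) {
      int a = src_argb[3];
      for (int c = 0; c < 3; ++c) {
        int v = a ? (src_argb[c] * 255 / a) : src_argb[c];
        dst_argb[c] = (uint8)(v > 255 ? 255 : v);
      }
      dst_argb[3] = (uint8)a;
      src_argb += 4;
      dst_argb += 4;
    }
  }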
-__declspec(naked)
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
+                                               uint8* dst_argb,
+                                               int width) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]   // src_argb
-    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        edx, [esp + 12 + 8]  // dst_argb
     mov        ecx, [esp + 12 + 12]  // width
     lea        ebx, fixed_invtbl8
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     movzx      esi, byte ptr [eax + 3]  // first alpha
     movzx      edi, byte ptr [eax + 7]  // second alpha
-    punpcklbw  xmm0, xmm0       // first 2
+    punpcklbw  xmm0, xmm0  // first 2
     movd       xmm2, dword ptr [ebx + esi * 4]
     movd       xmm3, dword ptr [ebx + edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
+    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
     movlhps    xmm2, xmm3
-    pmulhuw    xmm0, xmm2       // rgb * a
+    pmulhuw    xmm0, xmm2  // rgb * a
 
-    movdqu     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]  // read 4 pixels
     movzx      esi, byte ptr [eax + 11]  // third alpha
     movzx      edi, byte ptr [eax + 15]  // fourth alpha
-    punpckhbw  xmm1, xmm1       // next 2
+    punpckhbw  xmm1, xmm1  // next 2
     movd       xmm2, dword ptr [ebx + esi * 4]
     movd       xmm3, dword ptr [ebx + edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words
+    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
     movlhps    xmm2, xmm3
-    pmulhuw    xmm1, xmm2       // rgb * a
+    pmulhuw    xmm1, xmm2  // rgb * a
     lea        eax, [eax + 16]
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
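ARGBUnattenuateRow reverses that premultiplication by scaling each channel with a per-alpha reciprocal looked up in fixed_invtbl8 (defined elsewhere in the library). A scalar sketch of the intent, dividing directly instead of using the fixed-point table (names and rounding are illustrative):

#include <stdint.h>

// Undo alpha premultiplication: scale each channel back up by ~255 / a.
void ARGBUnattenuateRow_Ref(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t b = src[0], g = src[1], r = src[2], a = src[3];
    if (a) {
      b = b * 255u / a;
      g = g * 255u / a;
      r = r * 255u / a;
    }
    dst[0] = (uint8_t)(b > 255u ? 255u : b);  // clamp like packuswb
    dst[1] = (uint8_t)(g > 255u ? 255u : g);
    dst[2] = (uint8_t)(r > 255u ? 255u : r);
    dst[3] = (uint8_t)a;
    src += 4;
    dst += 4;
  }
}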
@@ -4450,25 +4388,24 @@
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
 // USE_GATHER is not on by default, due to being a slow instruction.
 #ifdef USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
+                                               uint8* dst_argb,
+                                               int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb0
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
 
  convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
-    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
+    vpsrld     ymm2, ymm6, 24  // alpha in low 8 bits.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
@@ -4488,50 +4425,50 @@
     ret
   }
 }
-#else  // USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+#else   // USE_GATHER
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
+                                               uint8* dst_argb,
+                                               int width) {
   __asm {
 
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]   // src_argb
-    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        edx, [esp + 12 + 8]  // dst_argb
     mov        ecx, [esp + 12 + 12]  // width
     sub        edx, eax
     lea        ebx, fixed_invtbl8
     vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
 
  convertloop:
-    // replace VPGATHER
-    movzx      esi, byte ptr [eax + 3]                 // alpha0
-    movzx      edi, byte ptr [eax + 7]                 // alpha1
+        // replace VPGATHER
+    movzx      esi, byte ptr [eax + 3]  // alpha0
+    movzx      edi, byte ptr [eax + 7]  // alpha1
     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
-    movzx      esi, byte ptr [eax + 11]                // alpha2
-    movzx      edi, byte ptr [eax + 15]                // alpha3
-    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
+    movzx      esi, byte ptr [eax + 11]  // alpha2
+    movzx      edi, byte ptr [eax + 15]  // alpha3
+    vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
-    movzx      esi, byte ptr [eax + 19]                // alpha4
-    movzx      edi, byte ptr [eax + 23]                // alpha5
-    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
+    movzx      esi, byte ptr [eax + 19]  // alpha4
+    movzx      edi, byte ptr [eax + 23]  // alpha5
+    vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
-    movzx      esi, byte ptr [eax + 27]                // alpha6
-    movzx      edi, byte ptr [eax + 31]                // alpha7
-    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
+    movzx      esi, byte ptr [eax + 27]  // alpha6
+    movzx      edi, byte ptr [eax + 31]  // alpha7
+    vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
-    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
-    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
-    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
-    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+    vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
+    vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
+    vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
+    vinserti128 ymm3, ymm3, xmm0, 1                // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
     // end of VPGATHER
 
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
@@ -4540,7 +4477,7 @@
     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vpackuswb  ymm0, ymm0, ymm1             // unmutated.
     vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
     sub        ecx, 8
@@ -4558,12 +4495,13 @@
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked)
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb,
+                                         uint8* dst_argb,
+                                         int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kARGBToYJ
     movdqa     xmm5, xmmword ptr kAddYJ64
 
@@ -4575,20 +4513,20 @@
     phaddw     xmm0, xmm1
     paddw      xmm0, xmm5  // Add .5 for rounding.
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 G bytes
+    packuswb   xmm0, xmm0  // 8 G bytes
     movdqu     xmm2, [eax]  // A
     movdqu     xmm3, [eax + 16]
     lea        eax, [eax + 32]
     psrld      xmm2, 24
     psrld      xmm3, 24
     packuswb   xmm2, xmm3
-    packuswb   xmm2, xmm2   // 8 A bytes
-    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
-    punpcklbw  xmm0, xmm0   // 8 GG words
-    punpcklbw  xmm3, xmm2   // 8 GA words
+    packuswb   xmm2, xmm2  // 8 A bytes
+    movdqa     xmm3, xmm0  // Weave into GG, GA, then GGGA
+    punpcklbw  xmm0, xmm0  // 8 GG words
+    punpcklbw  xmm3, xmm2  // 8 GA words
     movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm3   // GGGA first 4
-    punpckhwd  xmm1, xmm3   // GGGA next 4
+    punpcklwd  xmm0, xmm3  // GGGA first 4
+    punpckhwd  xmm1, xmm3  // GGGA next 4
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
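ARGBGrayRow_SSSE3 forms a luma value per pixel with pmaddubsw against kARGBToYJ, rounds via kAddYJ64 and a shift by 7, then weaves that value into B, G and R while keeping the original alpha. A scalar sketch with the weights left as stand-ins (the real coefficients live in kARGBToYJ, which is not shown in this diff):

#include <stdint.h>

// Hypothetical weights standing in for kARGBToYJ: 7-bit fixed-point
// B, G, R luma coefficients that sum to 128.  Assumption for this sketch.
enum { kWB = 15, kWG = 75, kWR = 38 };

void ARGBGrayRow_Ref(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t b = src[0], g = src[1], r = src[2], a = src[3];
    // +64 rounds (0.5 in 7-bit fixed point), matching "Add .5 for rounding".
    uint8_t y = (uint8_t)((b * kWB + g * kWG + r * kWR + 64) >> 7);
    dst[0] = dst[1] = dst[2] = y;  // GGGA
    dst[3] = (uint8_t)a;
    src += 4;
    dst += 4;
  }
}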
@@ -4604,24 +4542,20 @@
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 // Constant for ARGB color to sepia tone.
-static const vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+                                   17, 68, 35, 0, 17, 68, 35, 0};
 
-static const vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+                                   22, 88, 45, 0, 22, 88, 45, 0};
 
-static const vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+                                   24, 98, 50, 0, 24, 98, 50, 0};
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked)
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
   __asm {
-    mov        eax, [esp + 4]   /* dst_argb */
-    mov        ecx, [esp + 8]   /* width */
+    mov        eax, [esp + 4] /* dst_argb */
+    mov        ecx, [esp + 8] /* width */
     movdqa     xmm2, xmmword ptr kARGBToSepiaB
     movdqa     xmm3, xmmword ptr kARGBToSepiaG
     movdqa     xmm4, xmmword ptr kARGBToSepiaR
@@ -4633,32 +4567,32 @@
     pmaddubsw  xmm6, xmm2
     phaddw     xmm0, xmm6
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 B values
+    packuswb   xmm0, xmm0  // 8 B values
     movdqu     xmm5, [eax]  // G
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm3
     pmaddubsw  xmm1, xmm3
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 G values
-    punpcklbw  xmm0, xmm5   // 8 BG values
+    packuswb   xmm5, xmm5  // 8 G values
+    punpcklbw  xmm0, xmm5  // 8 BG values
     movdqu     xmm5, [eax]  // R
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm4
     pmaddubsw  xmm1, xmm4
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 R values
+    packuswb   xmm5, xmm5  // 8 R values
     movdqu     xmm6, [eax]  // A
     movdqu     xmm1, [eax + 16]
     psrld      xmm6, 24
     psrld      xmm1, 24
     packuswb   xmm6, xmm1
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm5, xmm6   // 8 RA values
-    movdqa     xmm1, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm5   // BGRA first 4
-    punpckhwd  xmm1, xmm5   // BGRA next 4
+    packuswb   xmm6, xmm6  // 8 A values
+    punpcklbw  xmm5, xmm6  // 8 RA values
+    movdqa     xmm1, xmm0  // Weave BG, RA together
+    punpcklwd  xmm0, xmm5  // BGRA first 4
+    punpckhwd  xmm1, xmm5  // BGRA next 4
     movdqu     [eax], xmm0
     movdqu     [eax + 16], xmm1
     lea        eax, [eax + 32]
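The three kARGBToSepia tables above hold the 7-bit fixed-point weights from the formulas in the comment, laid out as B,G,R,0 per pixel so pmaddubsw/phaddw can form each dot product. The equivalent scalar computation, with packuswb's clamp written out (a sketch, not the library's C path):

#include <stdint.h>

static inline uint8_t Clamp255(uint32_t v) {
  return (uint8_t)(v > 255u ? 255u : v);
}

// In-place sepia on ARGB (byte order B,G,R,A); alpha is preserved.
void ARGBSepiaRow_Ref(uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t b = dst[0], g = dst[1], r = dst[2];
    dst[0] = Clamp255((b * 17 + g * 68 + r * 35) >> 7);
    dst[1] = Clamp255((b * 22 + g * 88 + r * 45) >> 7);
    dst[2] = Clamp255((b * 24 + g * 98 + r * 50) >> 7);  // dst[3] untouched
    dst += 4;
  }
}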
@@ -4674,19 +4608,20 @@
 // Same as Sepia except matrix is provided.
 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked)
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
+                                                uint8* dst_argb,
+                                                const int8* matrix_argb,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* matrix_argb */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* matrix_argb */
     movdqu     xmm5, [ecx]
     pshufd     xmm2, xmm5, 0x00
     pshufd     xmm3, xmm5, 0x55
     pshufd     xmm4, xmm5, 0xaa
     pshufd     xmm5, xmm5, 0xff
-    mov        ecx, [esp + 16]  /* width */
+    mov        ecx, [esp + 16] /* width */
 
  convertloop:
     movdqu     xmm0, [eax]  // B
@@ -4697,31 +4632,31 @@
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm6, xmm3
     pmaddubsw  xmm1, xmm3
-    phaddsw    xmm0, xmm7   // B
-    phaddsw    xmm6, xmm1   // G
-    psraw      xmm0, 6      // B
-    psraw      xmm6, 6      // G
-    packuswb   xmm0, xmm0   // 8 B values
-    packuswb   xmm6, xmm6   // 8 G values
-    punpcklbw  xmm0, xmm6   // 8 BG values
+    phaddsw    xmm0, xmm7  // B
+    phaddsw    xmm6, xmm1  // G
+    psraw      xmm0, 6  // B
+    psraw      xmm6, 6  // G
+    packuswb   xmm0, xmm0  // 8 B values
+    packuswb   xmm6, xmm6  // 8 G values
+    punpcklbw  xmm0, xmm6  // 8 BG values
     movdqu     xmm1, [eax]  // R
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm7, xmm4
-    phaddsw    xmm1, xmm7   // R
+    phaddsw    xmm1, xmm7  // R
     movdqu     xmm6, [eax]  // A
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm6, xmm5
     pmaddubsw  xmm7, xmm5
-    phaddsw    xmm6, xmm7   // A
-    psraw      xmm1, 6      // R
-    psraw      xmm6, 6      // A
-    packuswb   xmm1, xmm1   // 8 R values
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm1, xmm6   // 8 RA values
-    movdqa     xmm6, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm1   // BGRA first 4
-    punpckhwd  xmm6, xmm1   // BGRA next 4
+    phaddsw    xmm6, xmm7  // A
+    psraw      xmm1, 6  // R
+    psraw      xmm6, 6  // A
+    packuswb   xmm1, xmm1  // 8 R values
+    packuswb   xmm6, xmm6  // 8 A values
+    punpcklbw  xmm1, xmm6  // 8 RA values
+    movdqa     xmm6, xmm0  // Weave BG, RA together
+    punpcklwd  xmm0, xmm1  // BGRA first 4
+    punpckhwd  xmm6, xmm1  // BGRA next 4
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm6
     lea        eax, [eax + 32]
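ARGBColorMatrixRow generalizes the sepia path: matrix_argb supplies four rows of four signed byte coefficients, one row per output channel, and results are scaled down by 6 bits (the psraw above) before clamping. A scalar sketch; the row-per-output-channel layout is an assumption here, and phaddsw's intermediate saturation is ignored:

#include <stdint.h>

static inline uint8_t Clamp255s(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

void ARGBColorMatrixRow_Ref(const uint8_t* src, uint8_t* dst,
                            const int8_t* matrix_argb, int width) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      const int8_t* m = matrix_argb + ch * 4;  // coefficients for B,G,R,A
      int32_t v =
          src[0] * m[0] + src[1] * m[1] + src[2] * m[2] + src[3] * m[3];
      dst[ch] = Clamp255s(v >> 6);
    }
    src += 4;
    dst += 4;
  }
}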
@@ -4735,15 +4670,17 @@
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked)
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
+__declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+                                            int scale,
+                                            int interval_size,
+                                            int interval_offset,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]    /* dst_argb */
-    movd       xmm2, [esp + 8]   /* scale */
-    movd       xmm3, [esp + 12]  /* interval_size */
-    movd       xmm4, [esp + 16]  /* interval_offset */
-    mov        ecx, [esp + 20]   /* width */
+    mov        eax, [esp + 4] /* dst_argb */
+    movd       xmm2, [esp + 8] /* scale */
+    movd       xmm3, [esp + 12] /* interval_size */
+    movd       xmm4, [esp + 16] /* interval_offset */
+    mov        ecx, [esp + 20] /* width */
     pshuflw    xmm2, xmm2, 040h
     pshufd     xmm2, xmm2, 044h
     pshuflw    xmm3, xmm3, 040h
@@ -4756,16 +4693,16 @@
 
  convertloop:
     movdqu     xmm0, [eax]  // read 4 pixels
-    punpcklbw  xmm0, xmm5   // first 2 pixels
-    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
+    punpcklbw  xmm0, xmm5  // first 2 pixels
+    pmulhuw    xmm0, xmm2  // pixel * scale >> 16
     movdqu     xmm1, [eax]  // read 4 pixels
-    punpckhbw  xmm1, xmm5   // next 2 pixels
+    punpckhbw  xmm1, xmm5  // next 2 pixels
     pmulhuw    xmm1, xmm2
-    pmullw     xmm0, xmm3   // * interval_size
+    pmullw     xmm0, xmm3  // * interval_size
     movdqu     xmm7, [eax]  // read 4 pixels
     pmullw     xmm1, xmm3
-    pand       xmm7, xmm6   // mask alpha
-    paddw      xmm0, xmm4   // + interval_size / 2
+    pand       xmm7, xmm6  // mask alpha
+    paddw      xmm0, xmm4  // + interval_size / 2
     paddw      xmm1, xmm4
     packuswb   xmm0, xmm1
     por        xmm0, xmm7
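ARGBQuantizeRow posterizes the color channels: multiply by scale and keep the high 16 bits (pmulhuw), multiply by interval_size, add interval_offset, then restore the original alpha through the 0xff000000 mask. Scalar sketch (illustrative names; scale is a 16.16-style factor):

#include <stdint.h>

void ARGBQuantizeRow_Ref(uint8_t* dst, int scale, int interval_size,
                         int interval_offset, int width) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 3; ++ch) {  // dst[3], the alpha byte, is kept
      uint32_t v = ((dst[ch] * (uint32_t)scale) >> 16) *
                       (uint32_t)interval_size +
                   (uint32_t)interval_offset;
      dst[ch] = (uint8_t)(v > 255u ? 255u : v);
    }
    dst += 4;
  }
}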
@@ -4780,25 +4717,26 @@
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-__declspec(naked)
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
+                                         uint8* dst_argb,
+                                         int width,
+                                         uint32 value) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     movd       xmm2, [esp + 16]  // value
     punpcklbw  xmm2, xmm2
     punpcklqdq xmm2, xmm2
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm0       // first 2
-    punpckhbw  xmm1, xmm1       // next 2
-    pmulhuw    xmm0, xmm2       // argb * value
-    pmulhuw    xmm1, xmm2       // argb * value
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    pmulhuw    xmm0, xmm2  // argb * value
+    pmulhuw    xmm1, xmm2  // argb * value
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
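ARGBShadeRow scales every channel by the matching byte of value. Both the pixel and the shade bytes are widened by duplication (punpcklbw x,x gives v * 0x0101), pmulhuw keeps the high word, and the extra shift by 8 brings the result back to a byte, so the effect is approximately p * v / 255. A scalar sketch of that exact bit pattern:

#include <stdint.h>

void ARGBShadeRow_Ref(const uint8_t* src, uint8_t* dst, int width,
                      uint32_t value) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      uint32_t p = src[ch] * 0x0101u;                  // punpcklbw x, x
      uint32_t v = ((value >> (ch * 8)) & 0xffu) * 0x0101u;
      dst[ch] = (uint8_t)(((p * v) >> 16) >> 8);       // pmulhuw, psrlw 8
    }
    src += 4;
    dst += 4;
  }
}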
@@ -4814,28 +4752,29 @@
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
+                                            const uint8* src_argb1,
+                                            uint8* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
-    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
+    movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
     movdqu     xmm1, xmm0
     movdqu     xmm3, xmm2
-    punpcklbw  xmm0, xmm0         // first 2
-    punpckhbw  xmm1, xmm1         // next 2
-    punpcklbw  xmm2, xmm5         // first 2
-    punpckhbw  xmm3, xmm5         // next 2
-    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
-    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    punpcklbw  xmm2, xmm5  // first 2
+    punpckhbw  xmm3, xmm5  // next 2
+    pmulhuw    xmm0, xmm2  // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3  // src_argb0 * src_argb1 next 2
     lea        eax, [eax + 16]
     lea        esi, [esi + 16]
     packuswb   xmm0, xmm1
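ARGBMultiplyRow multiplies two ARGB rows channel by channel. One operand is widened by duplication and the other by zero extension, so keeping the high 16 bits of the product approximates a * b / 255. Scalar sketch:

#include <stdint.h>

void ARGBMultiplyRow_Ref(const uint8_t* src0, const uint8_t* src1,
                         uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    uint32_t a = src0[i] * 0x0101u;  // punpcklbw x, x
    uint32_t b = src1[i];            // punpcklbw x, zero
    dst[i] = (uint8_t)((a * b) >> 16);
  }
}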
@@ -4853,13 +4792,14 @@
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked)
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
+                                       const uint8* src_argb1,
+                                       uint8* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
@@ -4867,11 +4807,11 @@
     jl         convertloop49
 
  convertloop4:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4882,11 +4822,11 @@
     jl         convertloop19
 
  convertloop1:
-    movd       xmm0, [eax]        // read 1 pixels from src_argb0
+    movd       xmm0, [eax]  // read 1 pixel from src_argb0
     lea        eax, [eax + 4]
-    movd       xmm1, [esi]        // read 1 pixels from src_argb1
+    movd       xmm1, [esi]  // read 1 pixel from src_argb1
     lea        esi, [esi + 4]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
     sub        ecx, 1
@@ -4901,22 +4841,23 @@
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
+                                            const uint8* src_argb1,
+                                            uint8* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
+    psubusb    xmm0, xmm1  // src_argb0 - src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
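ARGBAddRow and ARGBSubtractRow are plain byte-wise saturating arithmetic (paddusb / psubusb). Scalar sketches of both:

#include <stdint.h>

void ARGBAddRow_Ref(const uint8_t* a, const uint8_t* b, uint8_t* dst,
                    int width) {
  for (int i = 0; i < width * 4; ++i) {
    uint32_t v = (uint32_t)a[i] + b[i];
    dst[i] = (uint8_t)(v > 255u ? 255u : v);  // saturate like paddusb
  }
}

void ARGBSubtractRow_Ref(const uint8_t* a, const uint8_t* b, uint8_t* dst,
                         int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst[i] = (uint8_t)(a[i] > b[i] ? a[i] - b[i] : 0);  // like psubusb
  }
}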
@@ -4930,28 +4871,29 @@
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
+                                            const uint8* src_argb1,
+                                            uint8* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    vpxor      ymm5, ymm5, ymm5     // constant 0
+    vpxor      ymm5, ymm5, ymm5  // constant 0
 
  convertloop:
-    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    vmovdqu    ymm3, [esi]  // read 8 pixels from src_argb1
     lea        esi, [esi + 32]
-    vpunpcklbw ymm0, ymm1, ymm1   // low 4
-    vpunpckhbw ymm1, ymm1, ymm1   // high 4
-    vpunpcklbw ymm2, ymm3, ymm5   // low 4
-    vpunpckhbw ymm3, ymm3, ymm5   // high 4
-    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
-    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpunpcklbw ymm0, ymm1, ymm1  // low 4
+    vpunpckhbw ymm1, ymm1, ymm1  // high 4
+    vpunpcklbw ymm2, ymm3, ymm5  // low 4
+    vpunpckhbw ymm3, ymm3, ymm5  // high 4
+    vpmulhuw   ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
     vpackuswb  ymm0, ymm0, ymm1
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -4967,20 +4909,21 @@
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
+                                       const uint8* src_argb1,
+                                       uint8* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    vpaddusb   ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
     lea        esi, [esi + 32]
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -4996,20 +4939,21 @@
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
+                                            const uint8* src_argb1,
+                                            uint8* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    vpsubusb   ymm0, ymm0, [esi]  // src_argb0 - src_argb1
     lea        esi, [esi + 32]
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -5028,14 +4972,16 @@
 // -1  0  1
 // -2  0  2
 // -1  0  1
-__declspec(naked)
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
+                                      const uint8* src_y1,
+                                      const uint8* src_y2,
+                                      uint8* dst_sobelx,
+                                      int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_y0
-    mov        esi, [esp + 8 + 8]   // src_y1
+    mov        eax, [esp + 8 + 4]  // src_y0
+    mov        esi, [esp + 8 + 8]  // src_y1
     mov        edi, [esp + 8 + 12]  // src_y2
     mov        edx, [esp + 8 + 16]  // dst_sobelx
     mov        ecx, [esp + 8 + 20]  // width
@@ -5045,17 +4991,17 @@
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
+    movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -5063,7 +5009,7 @@
     paddw      xmm0, xmm2
     paddw      xmm0, xmm1
     paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
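The loop applies the kernel from the comment column-wise across the three source rows (weights 1, 2, 1 on the per-row differences) and takes the absolute value with the max(x, -x) trick before packing. Scalar sketch; like the SIMD code it reads two bytes past width, which callers must allow for:

#include <stdint.h>
#include <stdlib.h>  // abs

void SobelXRow_Ref(const uint8_t* y0, const uint8_t* y1, const uint8_t* y2,
                   uint8_t* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
            (y2[i] - y2[i + 2]);
    s = abs(s);
    dst_sobelx[i] = (uint8_t)(s > 255 ? 255 : s);  // packuswb clamp
  }
}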
@@ -5084,13 +5030,14 @@
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-__declspec(naked)
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
+__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
+                                      const uint8* src_y1,
+                                      uint8* dst_sobely,
+                                      int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_y0
-    mov        esi, [esp + 4 + 8]   // src_y1
+    mov        eax, [esp + 4 + 4]  // src_y0
+    mov        esi, [esp + 4 + 8]  // src_y1
     mov        edx, [esp + 4 + 12]  // dst_sobely
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
@@ -5098,17 +5045,17 @@
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
+    movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -5116,7 +5063,7 @@
     paddw      xmm0, xmm2
     paddw      xmm0, xmm1
     paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
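SobelYRow is the transposed kernel from the comment: the zero-weight middle row is never read, and the differences between the two remaining rows at offsets 0, 1 and 2 get weights 1, 2, 1 before the same abs-and-clamp. Scalar sketch:

#include <stdint.h>
#include <stdlib.h>  // abs

void SobelYRow_Ref(const uint8_t* y0, const uint8_t* y1,
                   uint8_t* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
            (y0[i + 2] - y1[i + 2]);
    s = abs(s);
    dst_sobely[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}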
@@ -5137,36 +5084,37 @@
 // R = Sobel
 // G = Sobel
 // B = Sobel
-__declspec(naked)
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
+__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
+                                     const uint8* src_sobely,
+                                     uint8* dst_argb,
+                                     int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
-    pslld      xmm5, 24             // 0xff000000
+    pcmpeqb    xmm5, xmm5  // alpha 255
+    pslld      xmm5, 24  // 0xff000000
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm2, xmm0             // GG
-    punpcklbw  xmm2, xmm0             // First 8
-    punpckhbw  xmm0, xmm0             // Next 8
-    movdqa     xmm1, xmm2             // GGGG
-    punpcklwd  xmm1, xmm2             // First 4
-    punpckhwd  xmm2, xmm2             // Next 4
-    por        xmm1, xmm5             // GGGA
+    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
+    movdqa     xmm2, xmm0  // GG
+    punpcklbw  xmm2, xmm0  // First 8
+    punpckhbw  xmm0, xmm0  // Next 8
+    movdqa     xmm1, xmm2  // GGGG
+    punpcklwd  xmm1, xmm2  // First 4
+    punpckhwd  xmm2, xmm2  // Next 4
+    por        xmm1, xmm5  // GGGA
     por        xmm2, xmm5
-    movdqa     xmm3, xmm0             // GGGG
-    punpcklwd  xmm3, xmm0             // Next 4
-    punpckhwd  xmm0, xmm0             // Last 4
-    por        xmm3, xmm5             // GGGA
+    movdqa     xmm3, xmm0  // GGGG
+    punpcklwd  xmm3, xmm0  // Next 4
+    punpckhwd  xmm0, xmm0  // Last 4
+    por        xmm3, xmm5  // GGGA
     por        xmm0, xmm5
     movdqu     [edx], xmm1
     movdqu     [edx + 16], xmm2
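SobelRow_SSE2 adds the two responses with saturation and expands the result into a grey ARGB pixel with opaque alpha, as the R/G/B comment above states. Scalar sketch:

#include <stdint.h>

void SobelRow_Ref(const uint8_t* src_sobelx, const uint8_t* src_sobely,
                  uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t s = (uint32_t)src_sobelx[i] + src_sobely[i];  // paddusb
    uint8_t g = (uint8_t)(s > 255u ? 255u : s);
    dst_argb[0] = g;
    dst_argb[1] = g;
    dst_argb[2] = g;
    dst_argb[3] = 255u;  // alpha from the 0xff000000 mask
    dst_argb += 4;
  }
}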
@@ -5184,22 +5132,23 @@
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked)
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
+__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
+                                            const uint8* src_sobely,
+                                            uint8* dst_y,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -5217,36 +5166,37 @@
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-__declspec(naked)
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
+                                       const uint8* src_sobely,
+                                       uint8* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
+    pcmpeqb    xmm5, xmm5  // alpha 255
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
     movdqa     xmm2, xmm0
-    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm3, xmm0             // XA
+    paddusb    xmm2, xmm1  // sobel = sobelx + sobely
+    movdqa     xmm3, xmm0  // XA
     punpcklbw  xmm3, xmm5
     punpckhbw  xmm0, xmm5
-    movdqa     xmm4, xmm1             // YS
+    movdqa     xmm4, xmm1  // YS
     punpcklbw  xmm4, xmm2
     punpckhbw  xmm1, xmm2
-    movdqa     xmm6, xmm4             // YSXA
-    punpcklwd  xmm6, xmm3             // First 4
-    punpckhwd  xmm4, xmm3             // Next 4
-    movdqa     xmm7, xmm1             // YSXA
-    punpcklwd  xmm7, xmm0             // Next 4
-    punpckhwd  xmm1, xmm0             // Last 4
+    movdqa     xmm6, xmm4  // YSXA
+    punpcklwd  xmm6, xmm3  // First 4
+    punpckhwd  xmm4, xmm3  // Next 4
+    movdqa     xmm7, xmm1  // YSXA
+    punpcklwd  xmm7, xmm0  // Next 4
+    punpckhwd  xmm1, xmm0  // Last 4
     movdqu     [edx], xmm6
     movdqu     [edx + 16], xmm4
     movdqu     [edx + 32], xmm7
@@ -5275,8 +5225,11 @@
 // count is number of averaged pixels to produce.
 // Does 4 pixels at a time.
 // This function requires alignment on accumulation buffer pointers.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
+void CumulativeSumToAverageRow_SSE2(const int32* topleft,
+                                    const int32* botleft,
+                                    int width,
+                                    int area,
+                                    uint8* dst,
                                     int count) {
   __asm {
     mov        eax, topleft  // eax topleft
@@ -5294,18 +5247,18 @@
     cmp        area, 128  // 128 pixels will not overflow 15 bits.
     ja         l4
 
-    pshufd     xmm5, xmm5, 0        // area
-    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
+    pshufd     xmm5, xmm5, 0  // area
+    pcmpeqb    xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
     psrld      xmm6, 16
     cvtdq2ps   xmm6, xmm6
-    addps      xmm5, xmm6           // (65536.0 + area - 1)
-    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
-    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
-    packssdw   xmm5, xmm5           // 16 bit shorts
+    addps      xmm5, xmm6  // (65536.0 + area - 1)
+    mulps      xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
+    cvtps2dq   xmm5, xmm5  // 0.16 fixed point
+    packssdw   xmm5, xmm5  // 16 bit shorts
 
     // 4 pixel loop small blocks.
   s4:
-    // top left
+        // top left
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
@@ -5347,7 +5300,7 @@
 
     // 4 pixel loop
   l4:
-    // top left
+        // top left
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
@@ -5373,7 +5326,7 @@
     paddd      xmm3, [esi + edx * 4 + 48]
     lea        esi, [esi + 64]
 
-    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
+    cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
     cvtdq2ps   xmm1, xmm1
     mulps      xmm0, xmm4
     mulps      xmm1, xmm4
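CumulativeSumToAverageRow reads the running-sum rows and applies the usual integral-image identity, top-left minus top-right minus bottom-left plus bottom-right, then scales by the precomputed 1/area. A scalar sketch of one output pixel, under the assumption that topleft/botleft point at the box corners and each pixel stores 4 interleaved int32 sums:

#include <stdint.h>

// 'w' is the box width in pixels, 'area' the box pixel count.
static void BoxAveragePixel(const int32_t* topleft, const int32_t* botleft,
                            int w, int area, uint8_t* dst) {
  for (int ch = 0; ch < 4; ++ch) {
    int32_t sum = topleft[ch] - topleft[w * 4 + ch] - botleft[ch] +
                  botleft[w * 4 + ch];
    int32_t avg = sum / area;  // the SSE2 path multiplies by 1/area instead
    dst[ch] = (uint8_t)(avg < 0 ? 0 : (avg > 255 ? 255 : avg));
  }
}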
@@ -5422,8 +5375,10 @@
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
+void ComputeCumulativeSumRow_SSE2(const uint8* row,
+                                  int32* cumsum,
+                                  const int32* previous_cumsum,
+                                  int width) {
   __asm {
     mov        eax, row
     mov        edx, cumsum
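ComputeCumulativeSumRow builds that integral image one row at a time: a horizontal running sum of the current ARGB row is added to the already accumulated previous row. Scalar sketch (illustrative, not the library's C fallback):

#include <stdint.h>

void ComputeCumulativeSumRow_Ref(const uint8_t* row, int32_t* cumsum,
                                 const int32_t* previous_cumsum, int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int ch = 0; ch < 4; ++ch) {
      sum[ch] += row[x * 4 + ch];
      cumsum[x * 4 + ch] = sum[ch] + previous_cumsum[x * 4 + ch];
    }
  }
}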
@@ -5505,10 +5460,11 @@
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked)
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width) {
+__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
+                                                     int src_argb_stride,
+                                                     uint8* dst_argb,
+                                                     const float* uv_dudv,
+                                                     int width) {
   __asm {
     push       esi
     push       edi
@@ -5519,7 +5475,7 @@
     movq       xmm2, qword ptr [ecx]  // uv
     movq       xmm7, qword ptr [ecx + 8]  // dudv
     mov        ecx, [esp + 28]  // width
-    shl        esi, 16          // 4, stride
+    shl        esi, 16  // 4, stride
     add        esi, 4
     movd       xmm5, esi
     sub        ecx, 4
@@ -5528,37 +5484,37 @@
     // setup for 4 pixel loop
     pshufd     xmm7, xmm7, 0x44  // dup dudv
     pshufd     xmm5, xmm5, 0  // dup 4, stride
-    movdqa     xmm0, xmm2    // x0, y0, x1, y1
+    movdqa     xmm0, xmm2  // x0, y0, x1, y1
     addps      xmm0, xmm7
     movlhps    xmm2, xmm0
     movdqa     xmm4, xmm7
-    addps      xmm4, xmm4    // dudv *= 2
-    movdqa     xmm3, xmm2    // x2, y2, x3, y3
+    addps      xmm4, xmm4  // dudv *= 2
+    movdqa     xmm3, xmm2  // x2, y2, x3, y3
     addps      xmm3, xmm4
-    addps      xmm4, xmm4    // dudv *= 4
+    addps      xmm4, xmm4  // dudv *= 4
 
     // 4 pixel loop
   l4:
-    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
-    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
-    packssdw   xmm0, xmm1    // x, y as 8 shorts
-    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
+    cvttps2dq  xmm0, xmm2  // x, y float to int first 2
+    cvttps2dq  xmm1, xmm3  // x, y float to int next 2
+    packssdw   xmm0, xmm1  // x, y as 8 shorts
+    pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       edi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       xmm1, [eax + esi]  // read pixel 0
     movd       xmm6, [eax + edi]  // read pixel 1
-    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
-    addps      xmm2, xmm4    // x, y += dx, dy first 2
+    punpckldq  xmm1, xmm6  // combine pixel 0 and 1
+    addps      xmm2, xmm4  // x, y += dx, dy first 2
     movq       qword ptr [edx], xmm1
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       edi, xmm0
     movd       xmm6, [eax + esi]  // read pixel 2
     movd       xmm0, [eax + edi]  // read pixel 3
-    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
-    addps      xmm3, xmm4    // x, y += dx, dy next 2
+    punpckldq  xmm6, xmm0  // combine pixel 2 and 3
+    addps      xmm3, xmm4  // x, y += dx, dy next 2
     movq       qword ptr 8[edx], xmm6
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -5570,10 +5526,10 @@
 
     // 1 pixel loop
   l1:
-    cvttps2dq  xmm0, xmm2    // x, y float to int
-    packssdw   xmm0, xmm0    // x, y as shorts
-    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
-    addps      xmm2, xmm7    // x, y += dx, dy
+    cvttps2dq  xmm0, xmm2  // x, y float to int
+    packssdw   xmm0, xmm0  // x, y as shorts
+    pmaddwd    xmm0, xmm5  // offset = x * 4 + y * stride
+    addps      xmm2, xmm7  // x, y += dx, dy
     movd       esi, xmm0
     movd       xmm0, [eax + esi]  // copy a pixel
     movd       [edx], xmm0
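ARGBAffineRow walks source coordinates (u, v) with per-pixel deltas (du, dv) and converts each to a byte offset of x * 4 + y * stride, as the pmaddwd comment says, then copies one pixel per step. Scalar sketch with the same truncating conversion as cvttps2dq:

#include <stdint.h>
#include <string.h>

// uv_dudv holds {u, v, du, dv}.
void ARGBAffineRow_Ref(const uint8_t* src_argb, int src_argb_stride,
                       uint8_t* dst_argb, const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    memcpy(dst_argb, src_argb + x * 4 + y * src_argb_stride, 4);  // one pixel
    dst_argb += 4;
    u += uv_dudv[2];  // x, y += dx, dy
    v += uv_dudv[3];
  }
}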
@@ -5590,15 +5546,16 @@
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
-__declspec(naked)
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) {
+__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
+                                           const uint8* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           int dst_width,
+                                           int source_y_fraction) {
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
@@ -5607,7 +5564,7 @@
     je         xloop100  // 0 / 256.  Blend 100 / 0.
     sub        edi, esi
     cmp        eax, 128
-    je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.
+    je         xloop50  // 128 /256 is 0.50.  Blend 50 / 50.
 
     vmovd      xmm0, eax  // high fraction 0..255
     neg        eax
@@ -5634,14 +5591,14 @@
     vpaddw     ymm0, ymm0, ymm4
     vpsrlw     ymm1, ymm1, 8
     vpsrlw     ymm0, ymm0, 8
-    vpackuswb  ymm0, ymm0, ymm1  // unmutates
+    vpackuswb  ymm0, ymm0, ymm1            // unmutates
     vmovdqu    [esi + edi], ymm0
     lea        esi, [esi + 32]
     sub        ecx, 32
     jg         xloop
     jmp        xloop99
 
-   // Blend 50 / 50.
+    // Blend 50 / 50.
  xloop50:
    vmovdqu    ymm0, [esi]
    vpavgb     ymm0, ymm0, [esi + edx]
@@ -5651,7 +5608,7 @@
    jg         xloop50
    jmp        xloop99
 
-   // Blend 100 / 0 - Copy row unchanged.
+    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    rep movsb
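InterpolateRow blends a row with the row src_stride below it using an 8-bit fraction, with the special cases visible above: 0 copies the top row (the rep movsb branch) and 128 averages the two. A scalar sketch of the general case; the SIMD paths bias by -128 to use signed multiplies, so rounding differs slightly:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

void InterpolateRow_Ref(uint8_t* dst, const uint8_t* src,
                        ptrdiff_t src_stride, int width,
                        int source_y_fraction) {
  const int f = source_y_fraction;  // 0..255
  if (f == 0) {
    memcpy(dst, src, (size_t)width);
    return;
  }
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((src[i] * (256 - f) + src[i + src_stride] * f) >> 8);
  }
}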
 
@@ -5666,16 +5623,17 @@
 
 // Bilinear filter 16x2 -> 16x1
 // TODO(fbarchard): Consider allowing 256 using memcpy.
-__declspec(naked)
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
+__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
+                                            const uint8* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            int dst_width,
+                                            int source_y_fraction) {
   __asm {
     push       esi
     push       edi
 
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
@@ -5684,7 +5642,7 @@
     cmp        eax, 0
     je         xloop100  // 0 /256.  Blend 100 / 0.
     cmp        eax, 128
-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
 
     movd       xmm0, eax  // high fraction 0..255
     neg        eax
@@ -5703,7 +5661,7 @@
     movdqu     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
-    psubb      xmm0, xmm4  // bias image by -128
+    psubb      xmm0, xmm4            // bias image by -128
     psubb      xmm1, xmm4
     movdqa     xmm2, xmm5
     movdqa     xmm3, xmm5
@@ -5747,15 +5705,16 @@
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked)
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width) {
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
+                                            uint8* dst_argb,
+                                            const uint8* shuffler,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_argb
-    mov        ecx, [esp + 12]   // shuffler
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // shuffler
     movdqu     xmm5, [ecx]
-    mov        ecx, [esp + 16]   // width
+    mov        ecx, [esp + 16]  // width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -5773,15 +5732,16 @@
 }
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked)
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
+__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
+                                           uint8* dst_argb,
+                                           const uint8* shuffler,
+                                           int width) {
   __asm {
-    mov        eax, [esp + 4]     // src_argb
-    mov        edx, [esp + 8]     // dst_argb
-    mov        ecx, [esp + 12]    // shuffler
-    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
-    mov        ecx, [esp + 16]    // width
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // shuffler
+    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
+    mov        ecx, [esp + 16]  // width
 
   wloop:
     vmovdqu    ymm0, [eax]
@@ -5801,19 +5761,20 @@
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-__declspec(naked)
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
+__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
+                                           uint8* dst_argb,
+                                           const uint8* shuffler,
+                                           int width) {
   __asm {
     push       ebx
     push       esi
-    mov        eax, [esp + 8 + 4]    // src_argb
-    mov        edx, [esp + 8 + 8]    // dst_argb
-    mov        esi, [esp + 8 + 12]   // shuffler
-    mov        ecx, [esp + 8 + 16]   // width
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        edx, [esp + 8 + 8]  // dst_argb
+    mov        esi, [esp + 8 + 12]  // shuffler
+    mov        ecx, [esp + 8 + 16]  // width
     pxor       xmm5, xmm5
 
-    mov        ebx, [esi]   // shuffler
+    mov        ebx, [esi]  // shuffler
     cmp        ebx, 0x03000102
     je         shuf_3012
     cmp        ebx, 0x00010203
@@ -5823,7 +5784,7 @@
     cmp        ebx, 0x02010003
     je         shuf_2103
 
-  // TODO(fbarchard): Use one source pointer and 3 offsets.
+    // TODO(fbarchard): Use one source pointer and 3 offsets.
   shuf_any1:
     movzx      ebx, byte ptr [esi]
     movzx      ebx, byte ptr [eax + ebx]
@@ -5849,7 +5810,7 @@
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm5
     punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
+    pshufhw    xmm0, xmm0, 01Bh  // 1B = 00011011 = 0x0123 = BGRAToARGB
     pshuflw    xmm0, xmm0, 01Bh
     pshufhw    xmm1, xmm1, 01Bh
     pshuflw    xmm1, xmm1, 01Bh
@@ -5866,7 +5827,7 @@
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm5
     punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
+    pshufhw    xmm0, xmm0, 039h  // 39 = 00111001 = 0x0321 = RGBAToARGB
     pshuflw    xmm0, xmm0, 039h
     pshufhw    xmm1, xmm1, 039h
     pshuflw    xmm1, xmm1, 039h
@@ -5883,7 +5844,7 @@
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm5
     punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
+    pshufhw    xmm0, xmm0, 093h  // 93 = 10010011 = 0x2103 = ARGBToRGBA
     pshuflw    xmm0, xmm0, 093h
     pshufhw    xmm1, xmm1, 093h
     pshuflw    xmm1, xmm1, 093h
@@ -5900,7 +5861,7 @@
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm5
     punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
+    pshufhw    xmm0, xmm0, 0C6h  // C6 = 11000110 = 0x3012 = ABGRToARGB
     pshuflw    xmm0, xmm0, 0C6h
     pshufhw    xmm1, xmm1, 0C6h
     pshuflw    xmm1, xmm1, 0C6h
@@ -5923,30 +5884,30 @@
 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1
 
-__declspec(naked)
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
+__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
+                                          const uint8* src_u,
+                                          const uint8* src_v,
+                                          uint8* dst_frame,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
+    mov        edx, [esp + 8 + 12]  // src_v
+    mov        edi, [esp + 8 + 16]  // dst_frame
+    mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
+    movq       xmm2, qword ptr [esi]  // U
+    movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
+    punpcklbw  xmm2, xmm3  // UV
+    movdqu     xmm0, [eax]  // Y
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2 // YUYV
+    punpcklbw  xmm0, xmm2  // YUYV
     punpckhbw  xmm1, xmm2
     movdqu     [edi], xmm0
     movdqu     [edi + 16], xmm1
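I422ToYUY2Row interleaves a full-width luma row with half-width U and V rows into 4-byte YUY2 macro-pixels (Y0, U, Y1, V); that is what the punpcklbw of Y against the UV-interleaved register produces. The UYVY variant below simply swaps the ordering to U, Y0, V, Y1. Scalar sketch for even widths:

#include <stdint.h>

void I422ToYUY2Row_Ref(const uint8_t* src_y, const uint8_t* src_u,
                       const uint8_t* src_v, uint8_t* dst_frame, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}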
@@ -5960,30 +5921,30 @@
   }
 }
 
-__declspec(naked)
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
+__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
+                                          const uint8* src_u,
+                                          const uint8* src_v,
+                                          uint8* dst_frame,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
+    mov        edx, [esp + 8 + 12]  // src_v
+    mov        edi, [esp + 8 + 16]  // dst_frame
+    mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
+    movq       xmm2, qword ptr [esi]  // U
+    movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
+    punpcklbw  xmm2, xmm3  // UV
+    movdqu     xmm0, [eax]  // Y
     movdqa     xmm1, xmm2
     lea        eax, [eax + 16]
-    punpcklbw  xmm1, xmm0 // UYVY
+    punpcklbw  xmm1, xmm0  // UYVY
     punpckhbw  xmm2, xmm0
     movdqu     [edi], xmm1
     movdqu     [edi + 16], xmm2
@@ -5998,22 +5959,22 @@
 }
 
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked)
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
+__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                                              uint8* dst_argb,
+                                              const float* poly,
+                                              int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* src_argb */
-    mov        edx, [esp + 4 + 8]   /* dst_argb */
-    mov        esi, [esp + 4 + 12]  /* poly */
-    mov        ecx, [esp + 4 + 16]  /* width */
+    mov        eax, [esp + 4 + 4] /* src_argb */
+    mov        edx, [esp + 4 + 8] /* dst_argb */
+    mov        esi, [esp + 4 + 12] /* poly */
+    mov        ecx, [esp + 4 + 16] /* width */
     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
 
     // 2 pixel loop.
  convertloop:
-//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
-//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
+        //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+        //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
     movq       xmm0, qword ptr [eax]  // BGRABGRA
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm3
@@ -6057,25 +6018,25 @@
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked)
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
+__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                                              uint8* dst_argb,
+                                              const float* poly,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]   /* poly */
-    vbroadcastf128 ymm4, [ecx]       // C0
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* poly */
+    vbroadcastf128 ymm4, [ecx]  // C0
     vbroadcastf128 ymm5, [ecx + 16]  // C1
     vbroadcastf128 ymm6, [ecx + 32]  // C2
     vbroadcastf128 ymm7, [ecx + 48]  // C3
-    mov        ecx, [esp + 16]  /* width */
+    mov        ecx, [esp + 16] /* width */
 
     // 2 pixel loop.
  convertloop:
     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
     lea         eax, [eax + 8]
-    vcvtdq2ps   ymm0, ymm0        // X 8 floats
+    vcvtdq2ps   ymm0, ymm0  // X 8 floats
     vmulps      ymm2, ymm0, ymm0  // X * X
     vmulps      ymm3, ymm0, ymm7  // C3 * X
     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
@@ -6095,16 +6056,125 @@
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
+                                         uint16* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    movd       xmm4, dword ptr [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+    mulss      xmm4, kExpBias
+    pshufd     xmm4, xmm4, 0
+    pxor       xmm5, xmm5
+    sub        edx, eax
+
+    // 8 pixel loop.
+ convertloop:
+    movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
+    add         eax, 16
+    movdqa      xmm3, xmm2
+    punpcklwd   xmm2, xmm5
+    cvtdq2ps    xmm2, xmm2  // convert 8 ints to floats
+    punpckhwd   xmm3, xmm5
+    cvtdq2ps    xmm3, xmm3
+    mulps       xmm2, xmm4
+    mulps       xmm3, xmm4
+    psrld       xmm2, 13
+    psrld       xmm3, 13
+    packssdw    xmm2, xmm3
+    movdqu      [eax + edx - 16], xmm2
+    sub         ecx, 8
+    jg          convertloop
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_SSE2
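
HalfFloatRow_SSE2 and HalfFloatRow_AVX2 above rely on an exponent-bias trick rather than a real float-to-half instruction: kExpBias is 2^-112, so multiplying by scale * kExpBias lowers the single-precision exponent by 112, exactly the difference between the float bias (127) and the half-float bias (15); after that, shifting the float bit pattern right by 13 drops the extra mantissa bits and the low 16 bits are already a truncated half float. A minimal scalar model of that trick, assuming the scaled inputs land in the half-float normal range (the helper name is illustrative):

#include <stdint.h>
#include <string.h>

// One element of the SSE2/AVX2 path above: scale, rebias the exponent by
// 2^-112, then reuse the top of the float encoding as a half float.
static uint16_t UIntToHalf_Sketch(uint16_t value, float scale) {
  float f = (float)value * (scale * 1.9259299444e-34f);  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // reinterpret the float as raw bits
  return (uint16_t)(bits >> 13);    // drop 13 mantissa bits (truncate)
}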
+
+#ifdef HAS_HALFFLOATROW_AVX2
+__declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
+                                         uint16* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    movd       xmm4, dword ptr [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+
+    vmulss     xmm4, xmm4, kExpBias
+    vbroadcastss ymm4, xmm4
+    vpxor      ymm5, ymm5, ymm5
+    sub        edx, eax
+
+    // 16 pixel loop.
+ convertloop:
+    vmovdqu     ymm2, [eax]  // 16 shorts
+    add         eax, 32
+    vpunpckhwd  ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
+    vpunpcklwd  ymm2, ymm2, ymm5
+    vcvtdq2ps   ymm3, ymm3  // convert 16 ints to floats
+    vcvtdq2ps   ymm2, ymm2
+    vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
+    vmulps      ymm2, ymm2, ymm4
+    vpsrld      ymm3, ymm3, 13  // float convert to 8 half floats truncate
+    vpsrld      ymm2, ymm2, 13
+    vpackssdw   ymm2, ymm2, ymm3
+    vmovdqu     [eax + edx - 32], ymm2
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+__declspec(naked) void HalfFloatRow_F16C(const uint16* src,
+                                         uint16* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    vbroadcastss ymm4, [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+    sub        edx, eax
+
+    // 16 pixel loop.
+ convertloop:
+    vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
+    vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
+    add         eax, 32
+    vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
+    vcvtdq2ps   ymm3, ymm3
+    vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
+    vmulps      ymm3, ymm3, ymm4
+    vcvtps2ph   xmm2, ymm2, 3  // float convert to 8 half floats truncate
+    vcvtps2ph   xmm3, ymm3, 3
+    vmovdqu     [eax + edx - 32], xmm2
+    vmovdqu     [eax + edx - 32 + 16], xmm3
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_F16C
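
HalfFloatRow_F16C skips the bias trick and converts with vcvtps2ph; immediate 3 selects round-toward-zero, so its results match the truncating SSE2/AVX2 paths. A minimal intrinsics sketch of one 16-pixel iteration, assuming AVX2 and F16C are available (the function and parameter names are illustrative, not libyuv APIs):

#include <immintrin.h>
#include <stdint.h>

// One 16-pixel iteration of the loop above, written with intrinsics.
static void HalfFloatIter_Sketch(const uint16_t* src, uint16_t* dst,
                                 __m256 scale8 /* broadcast scale */) {
  __m128i s0 = _mm_loadu_si128((const __m128i*)src);        // 8 shorts
  __m128i s1 = _mm_loadu_si128((const __m128i*)(src + 8));  // 8 more shorts
  __m256 f0 =
      _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s0)), scale8);
  __m256 f1 =
      _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s1)), scale8);
  _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(f0, 3));        // truncate
  _mm_storeu_si128((__m128i*)(dst + 8), _mm256_cvtps_ph(f1, 3));  // truncate
}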
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
-__declspec(naked)
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
-                           int width) {
+__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
+                                             const uint8* table_argb,
+                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
+    mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
   convertloop:
@@ -6131,13 +6201,14 @@
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Tranform RGB pixels with color table.
-__declspec(naked)
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
+                                            const uint8* table_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
+    mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
   convertloop:
@@ -6162,27 +6233,28 @@
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Tranform RGB pixels with luma table.
-__declspec(naked)
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                 int width,
-                                 const uint8* luma, uint32 lumacoeff) {
+__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+                                                   uint8* dst_argb,
+                                                   int width,
+                                                   const uint8* luma,
+                                                   uint32 lumacoeff) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   /* src_argb */
-    mov        edi, [esp + 8 + 8]   /* dst_argb */
-    mov        ecx, [esp + 8 + 12]  /* width */
+    mov        eax, [esp + 8 + 4] /* src_argb */
+    mov        edi, [esp + 8 + 8] /* dst_argb */
+    mov        ecx, [esp + 8 + 12] /* width */
     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
     pshufd     xmm2, xmm2, 0
     pshufd     xmm3, xmm3, 0
-    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
     psllw      xmm4, 8
     pxor       xmm5, xmm5
 
     // 4 pixel loop.
   convertloop:
-    movdqu     xmm0, xmmword ptr [eax]      // generate luma ptr
+    movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
     pmaddubsw  xmm0, xmm3
     phaddw     xmm0, xmm0
     pand       xmm0, xmm4  // mask out low bits
diff --git a/files/source/scale.cc b/files/source/scale.cc
index 36e3fe5..a5c7f7a 100644
--- a/files/source/scale.cc
+++ b/files/source/scale.cc
@@ -33,17 +33,24 @@
 // This is an optimized version for scaling down a plane to 1/2 of
 // its original size.
 
-static void ScalePlaneDown2(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown2(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8* src_ptr,
+                            uint8* dst_ptr,
                             enum FilterMode filtering) {
   int y;
   void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) =
-      filtering == kFilterNone ? ScaleRowDown2_C :
-      (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+      filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear
+                                                        ? ScaleRowDown2Linear_C
+                                                        : ScaleRowDown2Box_C);
   int row_stride = src_stride << 1;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
     src_stride = 0;
@@ -51,37 +58,47 @@
 
 #if defined(HAS_SCALEROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
-        ScaleRowDown2Box_Any_NEON);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON
+                                          : ScaleRowDown2Box_Any_NEON);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
-          ScaleRowDown2Box_NEON);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_NEON
+                                                      : ScaleRowDown2Box_NEON);
     }
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
-        ScaleRowDown2Box_Any_SSSE3);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_SSSE3
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3
+                                          : ScaleRowDown2Box_Any_SSSE3);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
-          ScaleRowDown2Box_SSSE3);
+      ScaleRowDown2 =
+          filtering == kFilterNone
+              ? ScaleRowDown2_SSSE3
+              : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3
+                                            : ScaleRowDown2Box_SSSE3);
     }
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
-        ScaleRowDown2Box_Any_AVX2);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_AVX2
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2
+                                          : ScaleRowDown2Box_Any_AVX2);
     if (IS_ALIGNED(dst_width, 32)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
-          ScaleRowDown2Box_AVX2);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_AVX2
+                                                      : ScaleRowDown2Box_AVX2);
     }
   }
 #endif
@@ -89,8 +106,22 @@
   if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+    ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
+                                          : ScaleRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 32)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_MSA
+                                                      : ScaleRowDown2Box_MSA);
+    }
   }
 #endif
 
@@ -105,18 +136,25 @@
   }
 }
 
-static void ScalePlaneDown2_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown2_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16* src_ptr,
+                               uint16* dst_ptr,
                                enum FilterMode filtering) {
   int y;
   void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
                         uint16* dst_ptr, int dst_width) =
-    filtering == kFilterNone ? ScaleRowDown2_16_C :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
-        ScaleRowDown2Box_16_C);
+      filtering == kFilterNone
+          ? ScaleRowDown2_16_C
+          : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
+                                        : ScaleRowDown2Box_16_C);
   int row_stride = src_stride << 1;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
     src_stride = 0;
@@ -124,23 +162,25 @@
 
 #if defined(HAS_SCALEROWDOWN2_16_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
-        ScaleRowDown2_16_NEON;
+    ScaleRowDown2 =
+        filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
-        ScaleRowDown2Box_16_SSE2);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_16_SSE2
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2
+                                          : ScaleRowDown2Box_16_SSE2);
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_16_DSPR2)
   if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
       IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
+    ScaleRowDown2 =
+        filtering ? ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
   }
 #endif
 
@@ -159,24 +199,30 @@
 // This is an optimized version for scaling down a plane to 1/4 of
 // its original size.
 
-static void ScalePlaneDown4(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown4(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8* src_ptr,
+                            uint8* dst_ptr,
                             enum FilterMode filtering) {
   int y;
   void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
   int row_stride = src_stride << 2;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride * 2;  // Point to row 2.
     src_stride = 0;
   }
 #if defined(HAS_SCALEROWDOWN4_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
     if (IS_ALIGNED(dst_width, 8)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
     }
@@ -184,8 +230,8 @@
 #endif
 #if defined(HAS_SCALEROWDOWN4_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 8)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
     }
@@ -193,8 +239,8 @@
 #endif
 #if defined(HAS_SCALEROWDOWN4_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
     if (IS_ALIGNED(dst_width, 16)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
     }
@@ -204,8 +250,16 @@
   if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
+    }
   }
 #endif
 
@@ -219,38 +273,44 @@
   }
 }
 
-static void ScalePlaneDown4_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown4_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16* src_ptr,
+                               uint16* dst_ptr,
                                enum FilterMode filtering) {
   int y;
   void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
                         uint16* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
   int row_stride = src_stride << 2;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride * 2;  // Point to row 2.
     src_stride = 0;
   }
 #if defined(HAS_SCALEROWDOWN4_16_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
-        ScaleRowDown4_16_NEON;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN4_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
-        ScaleRowDown4_16_SSE2;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN4_16_DSPR2)
   if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
       IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
   }
 #endif
 
@@ -265,11 +325,14 @@
 }
 
 // Scale plane down, 3/4
-
-static void ScalePlaneDown34(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown34(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8* src_ptr,
+                             uint8* dst_ptr,
                              enum FilterMode filtering) {
   int y;
   void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -277,6 +340,8 @@
   void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown34_0 = ScaleRowDown34_C;
@@ -346,8 +411,7 @@
     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
@@ -363,10 +427,14 @@
   }
 }
 
-static void ScalePlaneDown34_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown34_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16* src_ptr,
+                                uint16* dst_ptr,
                                 enum FilterMode filtering) {
   int y;
   void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
@@ -374,6 +442,8 @@
   void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
                            uint16* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown34_0 = ScaleRowDown34_16_C;
@@ -425,8 +495,7 @@
     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
@@ -442,7 +511,6 @@
   }
 }
 
-
 // Scale plane, 3/8
 // This is an optimized version for scaling down a plane to 3/8
 // of its original size.
@@ -458,10 +526,14 @@
 // ggghhhii
 // Boxes are 3x3, 2x3, 3x2 and 2x2
 
-static void ScalePlaneDown38(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown38(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8* src_ptr,
+                             uint8* dst_ptr,
                              enum FilterMode filtering) {
   int y;
   void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -470,6 +542,8 @@
                            uint8* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   assert(dst_width % 3 == 0);
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     ScaleRowDown38_3 = ScaleRowDown38_C;
     ScaleRowDown38_2 = ScaleRowDown38_C;
@@ -530,6 +604,26 @@
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN38_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
+    }
+    if (dst_width % 12 == 0) {
+      if (!filtering) {
+        ScaleRowDown38_3 = ScaleRowDown38_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_MSA;
+      } else {
+        ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
+      }
+    }
+  }
+#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -554,10 +648,14 @@
   }
 }
 
-static void ScalePlaneDown38_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown38_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16* src_ptr,
+                                uint16* dst_ptr,
                                 enum FilterMode filtering) {
   int y;
   void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
@@ -565,6 +663,8 @@
   void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
                            uint16* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown38_3 = ScaleRowDown38_16_C;
@@ -654,8 +754,12 @@
   return sum;
 }
 
-static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols2_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16* src_ptr,
+                            uint8* dst_ptr) {
   int i;
   int scaletbl[2];
   int minboxwidth = dx >> 16;
@@ -666,13 +770,18 @@
     int ix = x >> 16;
     x += dx;
     boxwidth = MIN1((x >> 16) - ix);
-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
-        scaletbl[boxwidth - minboxwidth] >> 16;
+    *dst_ptr++ =
+        SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
+        16;
   }
 }
 
-static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols2_16_C(int dst_width,
+                               int boxheight,
+                               int x,
+                               int dx,
+                               const uint32* src_ptr,
+                               uint16* dst_ptr) {
   int i;
   int scaletbl[2];
   int minboxwidth = dx >> 16;
@@ -684,12 +793,17 @@
     x += dx;
     boxwidth = MIN1((x >> 16) - ix);
     *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
-        scaletbl[boxwidth - minboxwidth]  >> 16;
+                     scaletbl[boxwidth - minboxwidth] >>
+                 16;
   }
 }
 
-static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols0_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int,
+                            const uint16* src_ptr,
+                            uint8* dst_ptr) {
   int scaleval = 65536 / boxheight;
   int i;
   src_ptr += (x >> 16);
@@ -698,8 +812,12 @@
   }
 }
 
-static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols1_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16* src_ptr,
+                            uint8* dst_ptr) {
   int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
@@ -710,8 +828,12 @@
   }
 }
 
-static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols1_16_C(int dst_width,
+                               int boxheight,
+                               int x,
+                               int dx,
+                               const uint32* src_ptr,
+                               uint16* dst_ptr) {
   int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
@@ -728,10 +850,14 @@
 // one pixel of destination using fixed point (16.16) to step
 // through source, sampling a box of pixel with simple
 // averaging.
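
The ScaleAddCols helpers above perform that 16.16 stepping per destination pixel. A small self-contained walk-through of the arithmetic for 8 source columns scaled to 3 destination columns (the numbers are just an example):

#include <stdio.h>

// 16.16 stepping for 8 source columns -> 3 destination columns.
int main(void) {
  int dx = (8 << 16) / 3;  // 174762, about 2.67 source pixels per output
  int x = 0;
  int i;
  for (i = 0; i < 3; ++i) {
    int ix = x >> 16;  // first source column of this box
    int boxwidth;
    x += dx;
    boxwidth = (x >> 16) - ix;
    printf("dst %d averages src %d..%d (boxwidth %d)\n", i, ix,
           ix + boxwidth - 1, boxwidth);
  }
  // Prints boxwidths 2, 3, 2: widths only ever take the values (dx >> 16)
  // and (dx >> 16) + 1, which is why ScaleAddCols2_C precomputes just two
  // reciprocals in scaletbl[].
  return 0;
}
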
-static void ScalePlaneBox(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneBox(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          int src_stride,
+                          int dst_stride,
+                          const uint8* src_ptr,
+                          uint8* dst_ptr) {
   int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -739,16 +865,16 @@
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   {
     // Allocate a row buffer of uint16.
     align_buffer_64(row16, src_width * 2);
     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint16* src_ptr, uint8* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_C:
-        ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+                         const uint16* src_ptr, uint8* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_C
+                      : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
     void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
         ScaleAddRow_C;
 #if defined(HAS_SCALEADDROW_SSE2)
@@ -775,6 +901,22 @@
       }
     }
 #endif
+#if defined(HAS_SCALEADDROW_MSA)
+    if (TestCpuFlag(kCpuHasMSA)) {
+      ScaleAddRow = ScaleAddRow_Any_MSA;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_MSA;
+      }
+    }
+#endif
+#if defined(HAS_SCALEADDROW_DSPR2)
+    if (TestCpuFlag(kCpuHasDSPR2)) {
+      ScaleAddRow = ScaleAddRow_Any_DSPR2;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_DSPR2;
+      }
+    }
+#endif
 
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
@@ -787,7 +929,7 @@
       boxheight = MIN1((y >> 16) - iy);
       memset(row16, 0, src_width * 2);
       for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint16 *)(row16), src_width);
+        ScaleAddRow(src, (uint16*)(row16), src_width);
         src += src_stride;
       }
       ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
@@ -797,10 +939,14 @@
   }
 }
 
-static void ScalePlaneBox_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneBox_16(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint16* src_ptr,
+                             uint16* dst_ptr) {
   int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -808,15 +954,15 @@
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   {
     // Allocate a row buffer of uint32.
     align_buffer_64(row32, src_width * 4);
     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint32* src_ptr, uint16* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
+                         const uint32* src_ptr, uint16* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
     void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
         ScaleAddRow_16_C;
 
@@ -837,7 +983,7 @@
       boxheight = MIN1((y >> 16) - iy);
       memset(row32, 0, src_width * 4);
       for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint32 *)(row32), src_width);
+        ScaleAddRow(src, (uint32*)(row32), src_width);
         src += src_stride;
       }
       ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
@@ -848,10 +994,14 @@
 }
 
 // Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearDown(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8* src_ptr,
+                            uint8* dst_ptr,
                             enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -864,14 +1014,14 @@
 
   const int max_y = (src_height - 1) << 16;
   int j;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
+                          int x, int dx) =
       (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -906,7 +1056,14 @@
     }
   }
 #endif
-
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
 #if defined(HAS_SCALEFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -944,10 +1101,14 @@
   free_aligned_buffer_64(row);
 }
 
-void ScalePlaneBilinearDown_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearDown_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16* src_ptr,
+                               uint16* dst_ptr,
                                enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -960,14 +1121,14 @@
 
   const int max_y = (src_height - 1) << 16;
   int j;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
+                          int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1011,7 +1172,6 @@
   }
 #endif
 
-
 #if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleFilterCols = ScaleFilterCols_16_SSSE3;
@@ -1041,10 +1201,14 @@
 }
 
 // Scale up down with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearUp(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          int src_stride,
+                          int dst_stride,
+                          const uint8* src_ptr,
+                          uint8* dst_ptr,
                           enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1053,14 +1217,14 @@
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
+                          int x, int dx) =
       filtering ? ScaleFilterCols_C : ScaleCols_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -1172,10 +1336,14 @@
   }
 }
 
-void ScalePlaneBilinearUp_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearUp_16(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint16* src_ptr,
+                             uint16* dst_ptr,
                              enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1184,14 +1352,14 @@
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
+                          int dst_width, int x, int dx) =
       filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1308,20 +1476,24 @@
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
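
Unlike the box path, this simple path does no averaging: each destination pixel is the source pixel selected by the integer part of the accumulator. A minimal per-row sketch of that sampling (the function name is illustrative; the real kernels are ScaleCols_C and its SIMD variants):

#include <stdint.h>

// Point-sample one row: x and dx are 16.16 fixed point.
static void ScaleColsPoint_Sketch(uint8_t* dst, const uint8_t* src,
                                  int dst_width, int x, int dx) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];  // upper 16 bits: integer source column
    x += dx;                // lower 16 bits accumulate the fraction
  }
}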
 
-static void ScalePlaneSimple(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneSimple(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8* src_ptr,
+                             uint8* dst_ptr) {
   int i;
-  void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_C;
+  void (*ScaleCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, int x,
+                    int dx) = ScaleCols_C;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
   int dx = 0;
   int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
   if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1340,20 +1512,24 @@
   }
 }
 
-static void ScalePlaneSimple_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneSimple_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16* src_ptr,
+                                uint16* dst_ptr) {
   int i;
-  void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_16_C;
+  void (*ScaleCols)(uint16 * dst_ptr, const uint16* src_ptr, int dst_width,
+                    int x, int dx) = ScaleCols_16_C;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
   int dx = 0;
   int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
   if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1366,8 +1542,7 @@
   }
 
   for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
-              dst_width, x, dx);
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
     dst_ptr += dst_stride;
     y += dy;
   }
@@ -1377,14 +1552,18 @@
 // This function dispatches to a specialized scaler based on scale factor.
 
 LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
+void ScalePlane(const uint8* src,
+                int src_stride,
+                int src_width,
+                int src_height,
+                uint8* dst,
+                int dst_stride,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering) {
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1403,46 +1582,42 @@
   if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
     // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical(src_height,
-                       dst_width, dst_height,
-                       src_stride, dst_stride, src, dst,
-                       0, 0, dy, 1, filtering);
+    ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
   }
   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
-      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, filtering);
       return;
     }
     if (2 * dst_width == src_width && 2 * dst_height == src_height) {
       // optimized, 1/2
-      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst, filtering);
       return;
     }
     // 3/8 rounded up for odd sized chroma height.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
       // optimized, 3/8
-      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, filtering);
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
         (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
-      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst, filtering);
       return;
     }
   }
   if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
-                  src_stride, dst_stride, src, dst);
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
+                  dst_stride, src, dst);
     return;
   }
   if (filtering && dst_height > src_height) {
@@ -1455,19 +1630,23 @@
                            src_stride, dst_stride, src, dst, filtering);
     return;
   }
-  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                   src_stride, dst_stride, src, dst);
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
+                   dst_stride, src, dst);
 }
 
 LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                  int src_width, int src_height,
-                  uint16* dst, int dst_stride,
-                  int dst_width, int dst_height,
-                  enum FilterMode filtering) {
+void ScalePlane_16(const uint16* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
+                   enum FilterMode filtering) {
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1486,16 +1665,13 @@
   if (dst_width == src_width) {
     int dy = FixedDiv(src_height, dst_height);
     // Arbitrary scale vertically, but unscaled vertically.
-    ScalePlaneVertical_16(src_height,
-                          dst_width, dst_height,
-                          src_stride, dst_stride, src, dst,
-                          0, 0, dy, 1, filtering);
+    ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
+                          dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
   }
   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
       ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
@@ -1508,15 +1684,14 @@
       return;
     }
     // 3/8 rounded up for odd sized chroma height.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
       // optimized, 3/8
       ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
-               filtering != kFilterBilinear) {
+        filtering != kFilterBilinear) {
       // optimized, 1/4
       ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
                          src_stride, dst_stride, src, dst, filtering);
@@ -1524,8 +1699,8 @@
     }
   }
   if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst);
+    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
+                     dst_stride, src, dst);
     return;
   }
   if (filtering && dst_height > src_height) {
@@ -1538,101 +1713,121 @@
                               src_stride, dst_stride, src, dst, filtering);
     return;
   }
-  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst);
+  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst);
 }
 
 // Scale an I420 image.
 // This function in turn calls a scaling function for each plane.
 
 LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
+int I420Scale(const uint8* src_y,
+              int src_stride_y,
+              const uint8* src_u,
+              int src_stride_u,
+              const uint8* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8* dst_y,
+              int dst_stride_y,
+              uint8* dst_u,
+              int dst_stride_u,
+              uint8* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering) {
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
-  ScalePlane(src_y, src_stride_y, src_width, src_height,
-             dst_y, dst_stride_y, dst_width, dst_height,
-             filtering);
-  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
-             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-             filtering);
-  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
-             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-             filtering);
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+             dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+             dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   return 0;
 }
 
 LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
+int I420Scale_16(const uint16* src_y,
+                 int src_stride_y,
+                 const uint16* src_u,
+                 int src_stride_u,
+                 const uint16* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16* dst_y,
+                 int dst_stride_y,
+                 uint16* dst_u,
+                 int dst_stride_u,
+                 uint16* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
                  enum FilterMode filtering) {
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
-  ScalePlane_16(src_y, src_stride_y, src_width, src_height,
-                dst_y, dst_stride_y, dst_width, dst_height,
-                filtering);
-  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
-                dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-                filtering);
-  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
-                dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-                filtering);
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+                dst_width, dst_height, filtering);
+  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+                dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+                dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   return 0;
 }
 
 // Deprecated api
 LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
+int Scale(const uint8* src_y,
+          const uint8* src_u,
+          const uint8* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8* dst_y,
+          uint8* dst_u,
+          uint8* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
           LIBYUV_BOOL interpolate) {
-  return I420Scale(src_y, src_stride_y,
-                   src_u, src_stride_u,
-                   src_v, src_stride_v,
-                   src_width, src_height,
-                   dst_y, dst_stride_y,
-                   dst_u, dst_stride_u,
-                   dst_v, dst_stride_v,
-                   dst_width, dst_height,
-                   interpolate ? kFilterBox : kFilterNone);
+  return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                   src_stride_v, src_width, src_height, dst_y, dst_stride_y,
+                   dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
+                   dst_height, interpolate ? kFilterBox : kFilterNone);
 }
 
 // Deprecated api
 LIBYUV_API
-int ScaleOffset(const uint8* src, int src_width, int src_height,
-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+int ScaleOffset(const uint8* src,
+                int src_width,
+                int src_height,
+                uint8* dst,
+                int dst_width,
+                int dst_height,
+                int dst_yoffset,
                 LIBYUV_BOOL interpolate) {
   // Chroma requires offset to multiple of 2.
   int dst_yoffset_even = dst_yoffset & ~1;
@@ -1643,26 +1838,21 @@
   int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
   const uint8* src_y = src;
   const uint8* src_u = src + src_width * src_height;
-  const uint8* src_v = src + src_width * src_height +
-                             src_halfwidth * src_halfheight;
+  const uint8* src_v =
+      src + src_width * src_height + src_halfwidth * src_halfheight;
   uint8* dst_y = dst + dst_yoffset_even * dst_width;
-  uint8* dst_u = dst + dst_width * dst_height +
-                 (dst_yoffset_even >> 1) * dst_halfwidth;
+  uint8* dst_u =
+      dst + dst_width * dst_height + (dst_yoffset_even >> 1) * dst_halfwidth;
   uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
                  (dst_yoffset_even >> 1) * dst_halfwidth;
-  if (!src || src_width <= 0 || src_height <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
+  if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 ||
+      dst_height <= 0 || dst_yoffset_even < 0 ||
       dst_yoffset_even >= dst_height) {
     return -1;
   }
-  return I420Scale(src_y, src_width,
-                   src_u, src_halfwidth,
-                   src_v, src_halfwidth,
-                   src_width, src_height,
-                   dst_y, dst_width,
-                   dst_u, dst_halfwidth,
-                   dst_v, dst_halfwidth,
-                   dst_width, aheight,
+  return I420Scale(src_y, src_width, src_u, src_halfwidth, src_v, src_halfwidth,
+                   src_width, src_height, dst_y, dst_width, dst_u,
+                   dst_halfwidth, dst_v, dst_halfwidth, dst_width, aheight,
                    interpolate ? kFilterBox : kFilterNone);
 }
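
For readers tracing the reformatted entry points above, a minimal caller of
I420Scale looks like the sketch below. It is only an illustration: the helper
name, the bilinear filter choice, and the assumption that strides equal the
plane widths are mine, not part of this CL; the I420Scale signature itself is
the one shown in the hunks above (declared in libyuv/scale.h).

#include <stdint.h>
#include "libyuv/scale.h"

// Hypothetical helper: scale one contiguous I420 frame to a new size.
// Strides are assumed to equal the plane widths for simplicity.
bool ScaleI420Frame(const uint8_t* src, int src_w, int src_h,
                    uint8_t* dst, int dst_w, int dst_h) {
  const int src_cw = (src_w + 1) / 2, src_ch = (src_h + 1) / 2;
  const int dst_cw = (dst_w + 1) / 2, dst_ch = (dst_h + 1) / 2;
  const uint8_t* src_y = src;
  const uint8_t* src_u = src_y + src_w * src_h;
  const uint8_t* src_v = src_u + src_cw * src_ch;
  uint8_t* dst_y = dst;
  uint8_t* dst_u = dst_y + dst_w * dst_h;
  uint8_t* dst_v = dst_u + dst_cw * dst_ch;
  return libyuv::I420Scale(src_y, src_w, src_u, src_cw, src_v, src_cw,
                           src_w, src_h, dst_y, dst_w, dst_u, dst_cw,
                           dst_v, dst_cw, dst_w, dst_h,
                           libyuv::kFilterBilinear) == 0;
}
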
 
diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc
index ed76a9e..d64ba7a 100644
--- a/files/source/scale_any.cc
+++ b/files/source/scale_any.cc
@@ -19,16 +19,15 @@
 #endif
 
 // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 int dst_width, int x, int dx) {                               \
-      int n = dst_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
-      }                                                                        \
-      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
-             dst_width & MASK, x + n * dx, dx);                                \
-    }
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                        \
+  void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, \
+               int dx) {                                                   \
+    int n = dst_width & ~MASK;                                             \
+    if (n > 0) {                                                           \
+      TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                               \
+    }                                                                      \
+    TERP_C(dst_ptr + n * BPP, src_ptr, dst_width & MASK, x + n * dx, dx);  \
+  }
 
 #ifdef HAS_SCALEFILTERCOLS_NEON
 CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
@@ -37,167 +36,378 @@
 CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
 #endif
 #ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
-     ScaleARGBFilterCols_C, 4, 3)
+CANY(ScaleARGBFilterCols_Any_NEON,
+     ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C,
+     4,
+     3)
 #endif
 #undef CANY
 
 // Fixed scale down.
-#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
-                     dst_ptr + n * BPP, r);                                    \
-    }
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+  void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr,   \
+               int dst_width) {                                              \
+    int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+    int n = dst_width - r;                                                   \
+    if (n > 0) {                                                             \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+    }                                                                        \
+    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                   dst_ptr + n * BPP, r);                                    \
+  }
 
 // Fixed scale down for odd source width.  Used by I420Blend subsampling.
 // Since dst_width is (width + 1) / 2, this function scales one less pixel
 // and copies the last pixel.
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1));               \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
-                     dst_ptr + n * BPP, r);                                    \
-    }
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+  void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr,   \
+               int dst_width) {                                              \
+    int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1));               \
+    int n = dst_width - r;                                                   \
+    if (n > 0) {                                                             \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+    }                                                                        \
+    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                   dst_ptr + n * BPP, r);                                    \
+  }
 
 #ifdef HAS_SCALEROWDOWN2_SSSE3
 SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
-      2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
-      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3,
+      ScaleRowDown2Linear_SSSE3,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      15)
+SDANY(ScaleRowDown2Box_Any_SSSE3,
+      ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3,
+      ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      15)
 #endif
 #ifdef HAS_SCALEROWDOWN2_AVX2
 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
-      ScaleRowDown2Linear_C, 2, 1, 31)
-SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
-      2, 1, 31)
-SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
-      2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2,
+      ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_AVX2,
+      ScaleRowDown2Box_AVX2,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
+SDODD(ScaleRowDown2Box_Odd_AVX2,
+      ScaleRowDown2Box_AVX2,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      31)
 #endif
 #ifdef HAS_SCALEROWDOWN2_NEON
 SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
-      ScaleRowDown2Box_C, 2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
-      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON,
+      ScaleRowDown2Linear_NEON,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      15)
+SDANY(ScaleRowDown2Box_Any_NEON,
+      ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      15)
+SDODD(ScaleRowDown2Box_Odd_NEON,
+      ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+      ScaleRowDown2Linear_MSA,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+      ScaleRowDown2Box_MSA,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
 #endif
 #ifdef HAS_SCALEROWDOWN4_SSSE3
 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
-      4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3,
+      ScaleRowDown4Box_SSSE3,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      7)
 #endif
 #ifdef HAS_SCALEROWDOWN4_AVX2
 SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
-      4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2,
+      ScaleRowDown4Box_AVX2,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      15)
 #endif
 #ifdef HAS_SCALEROWDOWN4_NEON
 SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
-      4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON,
+      ScaleRowDown4Box_NEON,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MSA
+SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_MSA,
+      ScaleRowDown4Box_MSA,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      15)
 #endif
 #ifdef HAS_SCALEROWDOWN34_SSSE3
-SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_SSSE3,
+      ScaleRowDown34_SSSE3,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
+      ScaleRowDown34_0_Box_SSSE3,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
+      ScaleRowDown34_1_Box_SSSE3,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      23)
 #endif
 #ifdef HAS_SCALEROWDOWN34_NEON
-SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_NEON,
+      ScaleRowDown34_NEON,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON,
+      ScaleRowDown34_0_Box_NEON,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+      ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      23)
 #endif
 #ifdef HAS_SCALEROWDOWN38_SSSE3
-SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
-SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_Any_SSSE3,
+      ScaleRowDown38_SSSE3,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
+      ScaleRowDown38_3_Box_SSSE3,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
+      ScaleRowDown38_2_Box_SSSE3,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      5)
 #endif
 #ifdef HAS_SCALEROWDOWN38_NEON
-SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_Any_NEON,
+      ScaleRowDown38_NEON,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON,
+      ScaleRowDown38_3_Box_NEON,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON,
+      ScaleRowDown38_2_Box_NEON,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_MSA
+SDANY(ScaleRowDown38_Any_MSA,
+      ScaleRowDown38_MSA,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_MSA,
+      ScaleRowDown38_3_Box_MSA,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_2_Box_Any_MSA,
+      ScaleRowDown38_2_Box_MSA,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      11)
 #endif
 
 #ifdef HAS_SCALEARGBROWDOWN2_SSE2
-SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
-      ScaleARGBRowDown2_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
-      ScaleARGBRowDown2Linear_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
-      ScaleARGBRowDown2Box_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2_Any_SSE2,
+      ScaleARGBRowDown2_SSE2,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
+      ScaleARGBRowDown2Linear_SSE2,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2,
+      ScaleARGBRowDown2Box_SSE2,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      3)
 #endif
 #ifdef HAS_SCALEARGBROWDOWN2_NEON
-SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
-      ScaleARGBRowDown2_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
-      ScaleARGBRowDown2Linear_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
-      ScaleARGBRowDown2Box_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2_Any_NEON,
+      ScaleARGBRowDown2_NEON,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON,
+      ScaleARGBRowDown2Linear_NEON,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON,
+      ScaleARGBRowDown2Box_NEON,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      7)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MSA
+SDANY(ScaleARGBRowDown2_Any_MSA,
+      ScaleARGBRowDown2_MSA,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Linear_Any_MSA,
+      ScaleARGBRowDown2Linear_MSA,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Box_Any_MSA,
+      ScaleARGBRowDown2Box_MSA,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      3)
 #endif
 #undef SDANY
 
 // Scale down by even scale factor.
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
-                     src_stepx, dst_ptr + n * BPP, r);                         \
-    }
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)      \
+  void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,  \
+               uint8* dst_ptr, int dst_width) {                            \
+    int r = (int)((unsigned int)dst_width % (MASK + 1));                   \
+    int n = dst_width - r;                                                 \
+    if (n > 0) {                                                           \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);       \
+    }                                                                      \
+    SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
+                   dst_ptr + n * BPP, r);                                  \
+  }
 
 #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
-SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_SSE2,
+       ScaleARGBRowDownEven_SSE2,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
+       ScaleARGBRowDownEvenBox_SSE2,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
 #endif
 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_NEON,
+       ScaleARGBRowDownEven_NEON,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
+       ScaleARGBRowDownEvenBox_NEON,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
+SDAANY(ScaleARGBRowDownEven_Any_MSA,
+       ScaleARGBRowDownEven_MSA,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
+       ScaleARGBRowDownEvenBox_MSA,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
 #endif
 
 // Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
-  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
-      int n = src_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
-      }                                                                        \
-      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
-    }
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)          \
+  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
+    int n = src_width & ~MASK;                                         \
+    if (n > 0) {                                                       \
+      SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                           \
+    }                                                                  \
+    SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);         \
+  }
 
 #ifdef HAS_SCALEADDROW_SSE2
 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
@@ -208,14 +418,15 @@
 #ifdef HAS_SCALEADDROW_NEON
 SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
 #endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_DSPR2
+SAANY(ScaleAddRow_Any_DSPR2, ScaleAddRow_DSPR2, ScaleAddRow_C, 15)
+#endif
 #undef SAANY
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
-
-
-
-
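
As an aside on the macros reformatted above: SDANY/SDODD/SDAANY/SAANY all
encode the same split, namely run the SIMD row function on the largest width
it can handle, then let the C row function finish the leftover pixels. A
standalone sketch of that idea, with hypothetical kernel names rather than
libyuv ones:

#include <stdint.h>

// Plain C tail kernel: accumulate src into dst one pixel at a time.
static void AddRowC(const uint8_t* src, uint16_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] += src[i];
  }
}

// Stand-in for a SIMD kernel that requires width to be a multiple of 16.
static void AddRowFast(const uint8_t* src, uint16_t* dst, int width) {
  AddRowC(src, dst, width);  // real code would use NEON/SSE2/MSA here.
}

// The "any width" wrapper: SIMD on the aligned part, C on the remainder,
// mirroring what the SAANY macro expands to.
static void AddRow_Any(const uint8_t* src, uint16_t* dst, int width) {
  const int kMask = 15;
  int n = width & ~kMask;                    // largest multiple of 16.
  if (n > 0) {
    AddRowFast(src, dst, n);
  }
  AddRowC(src + n, dst + n, width & kMask);  // 0..15 leftover pixels.
}
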
diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc
index 17f51ae..1ea28f0 100644
--- a/files/source/scale_argb.cc
+++ b/files/source/scale_argb.cc
@@ -30,20 +30,31 @@
 // ScaleARGB ARGB, 1/2
 // This is an optimized version for scaling down an ARGB image to 1/2 of
 // its original size.
-static void ScaleARGBDown2(int src_width, int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint8* src_argb, uint8* dst_argb,
-                           int x, int dx, int y, int dy,
+static void ScaleARGBDown2(int src_width,
+                           int src_height,
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint8* src_argb,
+                           uint8* dst_argb,
+                           int x,
+                           int dx,
+                           int y,
+                           int dy,
                            enum FilterMode filtering) {
   int j;
   int row_stride = src_stride * (dy >> 16);
   void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
                             uint8* dst_argb, int dst_width) =
-    filtering == kFilterNone ? ScaleARGBRowDown2_C :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
-        ScaleARGBRowDown2Box_C);
-  assert(dx == 65536 * 2);  // Test scale factor of 2.
+      filtering == kFilterNone
+          ? ScaleARGBRowDown2_C
+          : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
+                                        : ScaleARGBRowDown2Box_C);
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 2);      // Test scale factor of 2.
   assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
   // Advance to odd row, even column.
   if (filtering == kFilterBilinear) {
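
The (void)src_width, (void)src_height and (void)dx casts added in this hunk
are the usual C/C++ idiom for marking a parameter as deliberately unused, so
-Wunused-parameter stays quiet while the asserts still fire in debug builds.
A minimal standalone illustration (hypothetical function, not libyuv code):

#include <assert.h>

// width is only consulted by the assert, which compiles away under NDEBUG;
// the (void) cast keeps -Wunused-parameter happy in both configurations.
static int SumFirstTwo(const int* data, int width) {
  (void)width;
  assert(width >= 2);
  return data[0] + data[1];
}
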
@@ -54,25 +65,49 @@
 
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
-        ScaleARGBRowDown2Box_Any_SSE2);
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_SSE2
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
+                                          : ScaleARGBRowDown2Box_Any_SSE2);
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
-          ScaleARGBRowDown2Box_SSE2);
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_SSE2
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
+                                            : ScaleARGBRowDown2Box_SSE2);
     }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
-        ScaleARGBRowDown2Box_Any_NEON);
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
+                                          : ScaleARGBRowDown2Box_Any_NEON);
     if (IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
-          ScaleARGBRowDown2Box_NEON);
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_NEON
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
+                                            : ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
+                                          : ScaleARGBRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_MSA
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
+                                            : ScaleARGBRowDown2Box_MSA);
     }
   }
 #endif
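
The block just reformatted is libyuv's usual runtime dispatch: start from the
C row function, upgrade to the _Any_ SIMD variant when TestCpuFlag reports the
feature, and upgrade again to the full-width SIMD variant when the destination
width is suitably aligned. A condensed sketch of that selection logic, using
hypothetical kernel names and a fixed alignment of 8:

#include <stdint.h>

typedef void (*RowFunc)(const uint8_t* src, uint8_t* dst, int width);

// Hypothetical kernels: RowC handles any width; the "fast" ones stand in for
// SIMD code (the full-width variant would require width % 8 == 0).
static void RowC(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) dst[i] = src[i];
}
static void RowFast_Any(const uint8_t* s, uint8_t* d, int w) { RowC(s, d, w); }
static void RowFast(const uint8_t* s, uint8_t* d, int w) { RowC(s, d, w); }

// Same three-step upgrade as the blocks above: C -> Any_ SIMD -> full SIMD.
static RowFunc ChooseRow(bool has_simd, int dst_width) {
  RowFunc row = RowC;
  if (has_simd) {
    row = RowFast_Any;
    if ((dst_width % 8) == 0) {
      row = RowFast;
    }
  }
  return row;
}
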
@@ -90,21 +125,32 @@
 // ScaleARGB ARGB, 1/4
 // This is an optimized version for scaling down an ARGB image to 1/4 of
 // its original size.
-static void ScaleARGBDown4Box(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy) {
+static void ScaleARGBDown4Box(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8* src_argb,
+                              uint8* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy) {
   int j;
   // Allocate 2 rows of ARGB.
   const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
   align_buffer_64(row, kRowSize * 2);
   int row_stride = src_stride * (dy >> 16);
   void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
-    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+                            uint8* dst_argb, int dst_width) =
+      ScaleARGBRowDown2Box_C;
   // Advance to odd row, even column.
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
-  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 4);      // Test scale factor of 4.
   assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -125,8 +171,8 @@
 
   for (j = 0; j < dst_height; ++j) {
     ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
-    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
-                      row + kRowSize, dst_width * 2);
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
+                      dst_width * 2);
     ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
     src_argb += row_stride;
     dst_argb += dst_stride;
@@ -137,11 +183,18 @@
 // ScaleARGB ARGB Even
 // This is an optimized version for scaling down an ARGB image to an even
 // multiple of its original size.
-static void ScaleARGBDownEven(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy,
+static void ScaleARGBDownEven(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8* src_argb,
+                              uint8* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy,
                               enum FilterMode filtering) {
   int j;
   int col_step = dx >> 16;
@@ -149,26 +202,38 @@
   void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_step, uint8* dst_argb, int dst_width) =
       filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  (void)src_width;
+  (void)src_height;
   assert(IS_ALIGNED(src_width, 2));
   assert(IS_ALIGNED(src_height, 2));
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
-        ScaleARGBRowDownEven_Any_SSE2;
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
+                                     : ScaleARGBRowDownEven_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
-          ScaleARGBRowDownEven_SSE2;
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
     }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
-        ScaleARGBRowDownEven_Any_NEON;
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
+                                     : ScaleARGBRowDownEven_Any_NEON;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
-          ScaleARGBRowDownEven_NEON;
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
+                                     : ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
     }
   }
 #endif
@@ -184,25 +249,32 @@
 }
 
 // Scale ARGB down with bilinear interpolation.
-static void ScaleARGBBilinearDown(int src_width, int src_height,
-                                  int dst_width, int dst_height,
-                                  int src_stride, int dst_stride,
-                                  const uint8* src_argb, uint8* dst_argb,
-                                  int x, int dx, int y, int dy,
+static void ScaleARGBBilinearDown(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
+                                  int src_stride,
+                                  int dst_stride,
+                                  const uint8* src_argb,
+                                  uint8* dst_argb,
+                                  int x,
+                                  int dx,
+                                  int y,
+                                  int dy,
                                   enum FilterMode filtering) {
   int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
   int64 xlast = x + (int64)(dst_width - 1) * dx;
   int64 xl = (dx >= 0) ? x : xlast;
   int64 xr = (dx >= 0) ? xlast : x;
   int clip_src_width;
-  xl = (xl >> 16) & ~3;  // Left edge aligned.
-  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xl = (xl >> 16) & ~3;    // Left edge aligned.
+  xr = (xr >> 16) + 1;     // Right most pixel used.  Bilinear uses 2 pixels.
   xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
   if (xr > src_width) {
     xr = src_width;
@@ -235,14 +307,22 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
+      IS_ALIGNED(src_stride, 4)) {
     InterpolateRow = InterpolateRow_Any_DSPR2;
     if (IS_ALIGNED(clip_src_width, 4)) {
       InterpolateRow = InterpolateRow_DSPR2;
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
@@ -286,18 +366,25 @@
 }
 
 // Scale ARGB up with bilinear interpolation.
-static void ScaleARGBBilinearUp(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint8* src_argb, uint8* dst_argb,
-                                int x, int dx, int y, int dy,
+static void ScaleARGBBilinearUp(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint8* src_argb,
+                                uint8* dst_argb,
+                                int x,
+                                int dx,
+                                int y,
+                                int dy,
                                 enum FilterMode filtering) {
   int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) =
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   const int max_y = (src_height - 1) << 16;
 #if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -325,14 +412,22 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
+      IS_ALIGNED(dst_stride, 4)) {
     InterpolateRow = InterpolateRow_DSPR2;
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
   if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
   }
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -423,8 +518,10 @@
 
 #ifdef YUVSCALEUP
 // Scale YUV to ARGB up with bilinear interpolation.
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
-                                     int dst_width, int dst_height,
+static void ScaleYUVToARGBBilinearUp(int src_width,
+                                     int src_height,
+                                     int dst_width,
+                                     int dst_height,
                                      int src_stride_y,
                                      int src_stride_u,
                                      int src_stride_v,
@@ -433,14 +530,15 @@
                                      const uint8* src_u,
                                      const uint8* src_v,
                                      uint8* dst_argb,
-                                     int x, int dx, int y, int dy,
+                                     int x,
+                                     int dx,
+                                     int y,
+                                     int dy,
                                      enum FilterMode filtering) {
   int j;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
+  void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf, int width) =
+      I422ToARGBRow_C;
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
@@ -474,10 +572,18 @@
     I422ToARGBRow = I422ToARGBRow_DSPR2;
   }
 #endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
+  }
+#endif
 
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
+  void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -503,18 +609,26 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
+      IS_ALIGNED(dst_stride_argb, 4)) {
     InterpolateRow = InterpolateRow_DSPR2;
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) =
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
   }
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -635,15 +749,23 @@
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
 
-static void ScaleARGBSimple(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_argb, uint8* dst_argb,
-                            int x, int dx, int y, int dy) {
+static void ScaleARGBSimple(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8* src_argb,
+                            uint8* dst_argb,
+                            int x,
+                            int dx,
+                            int y,
+                            int dy) {
   int j;
-  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*ScaleARGBCols)(uint8 * dst_argb, const uint8* src_argb, int dst_width,
+                        int x, int dx) =
       (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+  (void)src_height;
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBCols = ScaleARGBCols_SSE2;
@@ -667,8 +789,8 @@
   }
 
   for (j = 0; j < dst_height; ++j) {
-    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
-                  dst_width, x, dx);
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
+                  dx);
     dst_argb += dst_stride;
     y += dy;
   }
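
As the comment above this hunk notes, x/dx and y/dy are 16.16 fixed point: the
upper 16 bits are the integer source position and the lower 16 bits the
fraction. A small sketch of how a column loop walks the source in that
representation (a hypothetical nearest-neighbour kernel, not the libyuv one):

#include <stdint.h>

// Nearest-neighbour horizontal scale using 16.16 fixed-point stepping.
// x is the starting source position and dx the per-pixel step, both 16.16.
static void ScaleColsNearest(uint8_t* dst, const uint8_t* src,
                             int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];  // integer part selects the source pixel.
    x += dx;                // the fraction accumulates in the low 16 bits.
  }
}

// For a uniform mapping, dx would be roughly (src_width << 16) / dst_width.
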
@@ -677,11 +799,18 @@
 // Scale an ARGB image.
 // This function in turn calls a scaling function
 // suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8* src, int src_stride,
-                      int src_width, int src_height,
-                      uint8* dst, int dst_stride,
-                      int dst_width, int dst_height,
-                      int clip_x, int clip_y, int clip_width, int clip_height,
+static void ScaleARGB(const uint8* src,
+                      int src_stride,
+                      int src_width,
+                      int src_height,
+                      uint8* dst,
+                      int dst_stride,
+                      int dst_width,
+                      int dst_height,
+                      int clip_x,
+                      int clip_y,
+                      int clip_width,
+                      int clip_height,
                       enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -690,8 +819,7 @@
   int dy = 0;
   // ARGB does not support box filter yet, but allow the user to pass it.
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height,
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                 filtering);
 
   // Negative src_height means invert the image.
@@ -700,17 +828,17 @@
     src = src + (src_height - 1) * src_stride;
     src_stride = -src_stride;
   }
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   if (clip_x) {
-    int64 clipf = (int64)(clip_x) * dx;
+    int64 clipf = (int64)(clip_x)*dx;
     x += (clipf & 0xffff);
     src += (clipf >> 16) * 4;
     dst += clip_x * 4;
   }
   if (clip_y) {
-    int64 clipf = (int64)(clip_y) * dy;
+    int64 clipf = (int64)(clip_y)*dy;
     y += (clipf & 0xffff);
     src += (clipf >> 16) * src_stride;
     dst += clip_y * dst_stride;
@@ -725,24 +853,20 @@
       if (!(dx & 0x10000) && !(dy & 0x10000)) {
         if (dx == 0x20000) {
           // Optimized 1/2 downsample.
-          ScaleARGBDown2(src_width, src_height,
-                         clip_width, clip_height,
-                         src_stride, dst_stride, src, dst,
-                         x, dx, y, dy, filtering);
+          ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
+                         src_stride, dst_stride, src, dst, x, dx, y, dy,
+                         filtering);
           return;
         }
         if (dx == 0x40000 && filtering == kFilterBox) {
           // Optimized 1/4 box downsample.
-          ScaleARGBDown4Box(src_width, src_height,
-                            clip_width, clip_height,
-                            src_stride, dst_stride, src, dst,
-                            x, dx, y, dy);
+          ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
+                            src_stride, dst_stride, src, dst, x, dx, y, dy);
           return;
         }
-        ScaleARGBDownEven(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
+        ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
         return;
       }
       // Optimized odd scale down. ie 3, 5, 7, 9x.
@@ -759,96 +883,105 @@
   }
   if (dx == 0x10000 && (x & 0xffff) == 0) {
     // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical(src_height,
-                       clip_width, clip_height,
-                       src_stride, dst_stride, src, dst,
-                       x, y, dy, 4, filtering);
+    ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                       dst_stride, src, dst, x, y, dy, 4, filtering);
     return;
   }
   if (filtering && dy < 65536) {
-    ScaleARGBBilinearUp(src_width, src_height,
-                        clip_width, clip_height,
-                        src_stride, dst_stride, src, dst,
-                        x, dx, y, dy, filtering);
+    ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
     return;
   }
   if (filtering) {
-    ScaleARGBBilinearDown(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
+    ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
     return;
   }
-  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
-                  src_stride, dst_stride, src, dst,
-                  x, dx, y, dy);
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                  dst_stride, src, dst, x, dx, y, dy);
 }
 
 LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
                   enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
-      clip_x < 0 || clip_y < 0 ||
+  if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
+      dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
       clip_width > 32768 || clip_height > 32768 ||
       (clip_x + clip_width) > dst_width ||
       (clip_y + clip_height) > dst_height) {
     return -1;
   }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            clip_x, clip_y, clip_width, clip_height, filtering);
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+            dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
+            clip_height, filtering);
   return 0;
 }
 
 // Scale an ARGB image.
 LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
+int ARGBScale(const uint8* src_argb,
+              int src_stride_argb,
+              int src_width,
+              int src_height,
+              uint8* dst_argb,
+              int dst_stride_argb,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0) {
+  if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            0, 0, dst_width, dst_height, filtering);
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+            dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
+            filtering);
   return 0;
 }
 
 // Scale with YUV conversion to ARGB and clipping.
 LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
+int YUVToARGBScaleClip(const uint8* src_y,
+                       int src_stride_y,
+                       const uint8* src_u,
+                       int src_stride_u,
+                       const uint8* src_v,
+                       int src_stride_v,
                        uint32 src_fourcc,
-                       int src_width, int src_height,
-                       uint8* dst_argb, int dst_stride_argb,
+                       int src_width,
+                       int src_height,
+                       uint8* dst_argb,
+                       int dst_stride_argb,
                        uint32 dst_fourcc,
-                       int dst_width, int dst_height,
-                       int clip_x, int clip_y, int clip_width, int clip_height,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
                        enum FilterMode filtering) {
   uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
   int r;
-  I420ToARGB(src_y, src_stride_y,
-             src_u, src_stride_u,
-             src_v, src_stride_v,
-             argb_buffer, src_width * 4,
-             src_width, src_height);
+  (void)src_fourcc;  // TODO(fbarchard): implement and/or assert.
+  (void)dst_fourcc;
+  I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+             argb_buffer, src_width * 4, src_width, src_height);
 
-  r = ARGBScaleClip(argb_buffer, src_width * 4,
-                    src_width, src_height,
-                    dst_argb, dst_stride_argb,
-                    dst_width, dst_height,
-                    clip_x, clip_y, clip_width, clip_height,
-                    filtering);
+  r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
+                    dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+                    clip_width, clip_height, filtering);
   free(argb_buffer);
   return r;
 }
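
For context on the ARGB entry points whose prototypes are reflowed above, here is a minimal caller sketch. It is illustrative only: HalveFrame is a hypothetical helper, the frame dimensions are made up, and it assumes the public libyuv/scale_argb.h header and the kFilterBilinear mode from libyuv/scale.h.

// Hypothetical caller: downscale a 640x360 ARGB frame to 320x180.
#include <stdint.h>
#include <stdlib.h>
#include "libyuv/scale_argb.h"

int HalveFrame(const uint8_t* src_argb, uint8_t** out_dst) {
  const int src_w = 640, src_h = 360;
  const int dst_w = 320, dst_h = 180;
  uint8_t* dst = (uint8_t*)malloc((size_t)dst_w * dst_h * 4);
  if (!dst) {
    return -1;
  }
  // Strides are bytes per row; tightly packed ARGB is width * 4.
  int r = ARGBScale(src_argb, src_w * 4, src_w, src_h, dst, dst_w * 4, dst_w,
                    dst_h, kFilterBilinear);  // 0 on success, -1 on bad args
  if (r != 0) {
    free(dst);
    return r;
  }
  *out_dst = dst;
  return 0;
}
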
diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc
index 3507aa4..1bef39d 100644
--- a/files/source/scale_common.cc
+++ b/files/source/scale_common.cc
@@ -28,9 +28,12 @@
 }
 
 // CPU agnostic row functions
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
+void ScaleRowDown2_C(const uint8* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8* dst,
+                     int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[1];
     dst[1] = src_ptr[3];
@@ -42,9 +45,12 @@
   }
 }
 
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
+void ScaleRowDown2_16_C(const uint16* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16* dst,
+                        int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[1];
     dst[1] = src_ptr[3];
@@ -56,10 +62,13 @@
   }
 }
 
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
+void ScaleRowDown2Linear_C(const uint8* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8* dst,
+                           int dst_width) {
   const uint8* s = src_ptr;
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + 1) >> 1;
     dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -71,10 +80,13 @@
   }
 }
 
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst, int dst_width) {
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16* dst,
+                              int dst_width) {
   const uint16* s = src_ptr;
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + 1) >> 1;
     dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -86,8 +98,10 @@
   }
 }
 
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
+void ScaleRowDown2Box_C(const uint8* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8* dst,
+                        int dst_width) {
   const uint8* s = src_ptr;
   const uint8* t = src_ptr + src_stride;
   int x;
@@ -103,8 +117,10 @@
   }
 }
 
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst,
+                            int dst_width) {
   const uint8* s = src_ptr;
   const uint8* t = src_ptr + src_stride;
   int x;
@@ -125,8 +141,10 @@
   dst[0] = (s[0] + t[0] + 1) >> 1;
 }
 
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
+void ScaleRowDown2Box_16_C(const uint16* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16* dst,
+                           int dst_width) {
   const uint16* s = src_ptr;
   const uint16* t = src_ptr + src_stride;
   int x;
@@ -142,9 +160,12 @@
   }
 }
 
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
+void ScaleRowDown4_C(const uint8* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8* dst,
+                     int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[2];
     dst[1] = src_ptr[6];
@@ -156,9 +177,12 @@
   }
 }
 
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
+void ScaleRowDown4_16_C(const uint16* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16* dst,
+                        int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[2];
     dst[1] = src_ptr[6];
@@ -170,81 +194,88 @@
   }
 }
 
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
+void ScaleRowDown4Box_C(const uint8* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8* dst,
+                        int dst_width) {
   intptr_t stride = src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
+              src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+              src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+              src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+              src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+              src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+              src_ptr[stride * 3 + 7] + 8) >>
+             4;
     dst += 2;
     src_ptr += 8;
   }
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
   }
 }
 
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
+void ScaleRowDown4Box_16_C(const uint16* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16* dst,
+                           int dst_width) {
   intptr_t stride = src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
+              src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+              src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+              src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+              src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+              src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+              src_ptr[stride * 3 + 7] + 8) >>
+             4;
     dst += 2;
     src_ptr += 8;
   }
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
   }
 }
 
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
+void ScaleRowDown34_C(const uint8* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8* dst,
+                      int dst_width) {
   int x;
+  (void)src_stride;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -255,9 +286,12 @@
   }
 }
 
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
+void ScaleRowDown34_16_C(const uint16* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16* dst,
+                         int dst_width) {
   int x;
+  (void)src_stride;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -269,8 +303,10 @@
 }
 
 // Filter rows 0 and 1 together, 3 : 1
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* d,
+                            int dst_width) {
   const uint8* s = src_ptr;
   const uint8* t = src_ptr + src_stride;
   int x;
@@ -291,8 +327,10 @@
   }
 }
 
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* d,
+                               int dst_width) {
   const uint16* s = src_ptr;
   const uint16* t = src_ptr + src_stride;
   int x;
@@ -314,8 +352,10 @@
 }
 
 // Filter rows 1 and 2 together, 1 : 1
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* d,
+                            int dst_width) {
   const uint8* s = src_ptr;
   const uint8* t = src_ptr + src_stride;
   int x;
@@ -336,8 +376,10 @@
   }
 }
 
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* d,
+                               int dst_width) {
   const uint16* s = src_ptr;
   const uint16* t = src_ptr + src_stride;
   int x;
@@ -359,8 +401,11 @@
 }
 
 // Scales a single row of pixels using point sampling.
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                 int dst_width, int x, int dx) {
+void ScaleCols_C(uint8* dst_ptr,
+                 const uint8* src_ptr,
+                 int dst_width,
+                 int x,
+                 int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[0] = src_ptr[x >> 16];
@@ -374,8 +419,11 @@
   }
 }
 
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                    int dst_width, int x, int dx) {
+void ScaleCols_16_C(uint16* dst_ptr,
+                    const uint16* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[0] = src_ptr[x >> 16];
@@ -390,9 +438,14 @@
 }
 
 // Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
-                    int dst_width, int x, int dx) {
+void ScaleColsUp2_C(uint8* dst_ptr,
+                    const uint8* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx) {
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[1] = dst_ptr[0] = src_ptr[0];
     src_ptr += 1;
@@ -403,9 +456,14 @@
   }
 }
 
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleColsUp2_16_C(uint16* dst_ptr,
+                       const uint16* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[1] = dst_ptr[0] = src_ptr[0];
     src_ptr += 1;
@@ -418,16 +476,19 @@
 
 // (1-f)a + fb can be replaced with a + f(b-a)
 #if defined(__arm__) || defined(__aarch64__)
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
-    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#define BLENDER(a, b, f) \
+  (uint8)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 #else
-// inteluses 7 bit math with rounding.
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
-    (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+// Intel uses 7 bit math with rounding.
+#define BLENDER(a, b, f) \
+  (uint8)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
 #endif
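
As an aside on the identity noted above ((1-f)*a + f*b == a + f*(b-a)), a tiny standalone sketch of the ARM-path rounding follows; Blend8 and the sample values are invented for illustration and are not part of this CL.

// Illustrative only: one 8-bit blend with a 16.16 fractional weight f,
// written in the a + f*(b-a) form with +0x8000 for round-to-nearest.
#include <stdint.h>
#include <stdio.h>

static uint8_t Blend8(uint8_t a, uint8_t b, uint32_t f) {  // f in 0..65535
  return (uint8_t)((int)a + (((int)f * ((int)b - (int)a) + 0x8000) >> 16));
}

int main(void) {
  printf("%d\n", Blend8(100, 200, 0x8000));  // f = 0.5 -> rounded midpoint 150
  return 0;
}
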
 
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleFilterCols_C(uint8* dst_ptr,
+                       const uint8* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
@@ -450,8 +511,11 @@
   }
 }
 
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
-                         int dst_width, int x32, int dx) {
+void ScaleFilterCols64_C(uint8* dst_ptr,
+                         const uint8* src_ptr,
+                         int dst_width,
+                         int x32,
+                         int dx) {
   int64 x = (int64)(x32);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
@@ -477,11 +541,14 @@
 #undef BLENDER
 
 // Same as 8 bit arm blender but return is cast to uint16
-#define BLENDER(a, b, f) (uint16)((int)(a) + \
-    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#define BLENDER(a, b, f) \
+  (uint16)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleFilterCols_16_C(uint16* dst_ptr,
+                          const uint16* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
@@ -504,8 +571,11 @@
   }
 }
 
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         int dst_width, int x32, int dx) {
+void ScaleFilterCols64_16_C(uint16* dst_ptr,
+                            const uint16* src_ptr,
+                            int dst_width,
+                            int x32,
+                            int dx) {
   int64 x = (int64)(x32);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
@@ -530,9 +600,12 @@
 }
 #undef BLENDER
 
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
+void ScaleRowDown38_C(const uint8* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8* dst,
+                      int dst_width) {
   int x;
+  (void)src_stride;
   assert(dst_width % 3 == 0);
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -543,9 +616,12 @@
   }
 }
 
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
+void ScaleRowDown38_16_C(const uint16* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16* dst,
+                         int dst_width) {
   int x;
+  (void)src_stride;
   assert(dst_width % 3 == 0);
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -559,25 +635,29 @@
 // 8x3 -> 3x1
 void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
                             ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+                            uint8* dst_ptr,
+                            int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
+    dst_ptr[0] =
+        (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+         src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[1] =
+        (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+         src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+         src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+            (65536 / 6) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
@@ -585,66 +665,80 @@
 
 void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
                                ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
+                               uint16* dst_ptr,
+                               int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
+    dst_ptr[0] =
+        (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+         src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[1] =
+        (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+         src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+         src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+            (65536 / 6) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
 // 8x2 -> 3x1
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr,
+                            int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+                  src_ptr[stride + 1] + src_ptr[stride + 2]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+                  src_ptr[stride + 4] + src_ptr[stride + 5]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+            (65536 / 4) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr,
+                               int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+                  src_ptr[stride + 1] + src_ptr[stride + 2]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+                  src_ptr[stride + 4] + src_ptr[stride + 5]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+            (65536 / 4) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
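
The box filters reflowed above avoid a per-pixel divide by multiplying by a 16.16 reciprocal: (sum * (65536 / N)) >> 16 approximates sum / N for N = 9, 6 or 4. A small standalone check of that trick (not part of the patch; values are arbitrary):

// Illustrative only: 65536 / 9 == 7281, so (sum * 7281) >> 16 is roughly
// sum / 9; because 7281 is slightly below 65536/9, it can come out 1 low.
#include <stdio.h>

int main(void) {
  int sum = 9 * 200;                     // nine pixels of value 200
  int approx = sum * (65536 / 9) >> 16;  // 199
  int exact = sum / 9;                   // 200
  printf("approx=%d exact=%d\n", approx, exact);
  return 0;
}
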
@@ -680,11 +774,12 @@
 
 void ScaleARGBRowDown2_C(const uint8* src_argb,
                          ptrdiff_t src_stride,
-                         uint8* dst_argb, int dst_width) {
+                         uint8* dst_argb,
+                         int dst_width) {
   const uint32* src = (const uint32*)(src_argb);
   uint32* dst = (uint32*)(dst_argb);
-
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[1];
     dst[1] = src[3];
@@ -698,8 +793,10 @@
 
 void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
+                               uint8* dst_argb,
+                               int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width; ++x) {
     dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
     dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
@@ -710,29 +807,37 @@
   }
 }
 
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDown2Box_C(const uint8* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8* dst_argb,
+                            int dst_width) {
   int x;
   for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+                   src_argb[src_stride + 4] + 2) >>
+                  2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+                   src_argb[src_stride + 5] + 2) >>
+                  2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+                   src_argb[src_stride + 6] + 2) >>
+                  2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+                   src_argb[src_stride + 7] + 2) >>
+                  2;
     src_argb += 8;
     dst_argb += 4;
   }
 }
 
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEven_C(const uint8* src_argb,
+                            ptrdiff_t src_stride,
                             int src_stepx,
-                            uint8* dst_argb, int dst_width) {
+                            uint8* dst_argb,
+                            int dst_width) {
   const uint32* src = (const uint32*)(src_argb);
   uint32* dst = (uint32*)(dst_argb);
-
+  (void)src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[0];
@@ -748,25 +853,33 @@
 void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
                                ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width) {
+                               uint8* dst_argb,
+                               int dst_width) {
   int x;
   for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+                   src_argb[src_stride + 4] + 2) >>
+                  2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+                   src_argb[src_stride + 5] + 2) >>
+                  2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+                   src_argb[src_stride + 6] + 2) >>
+                  2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+                   src_argb[src_stride + 7] + 2) >>
+                  2;
     src_argb += src_stepx * 4;
     dst_argb += 4;
   }
 }
 
 // Scales a single row of pixels using point sampling.
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
-                     int dst_width, int x, int dx) {
+void ScaleARGBCols_C(uint8* dst_argb,
+                     const uint8* src_argb,
+                     int dst_width,
+                     int x,
+                     int dx) {
   const uint32* src = (const uint32*)(src_argb);
   uint32* dst = (uint32*)(dst_argb);
   int j;
@@ -782,8 +895,11 @@
   }
 }
 
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
-                       int dst_width, int x32, int dx) {
+void ScaleARGBCols64_C(uint8* dst_argb,
+                       const uint8* src_argb,
+                       int dst_width,
+                       int x32,
+                       int dx) {
   int64 x = (int64)(x32);
   const uint32* src = (const uint32*)(src_argb);
   uint32* dst = (uint32*)(dst_argb);
@@ -801,11 +917,16 @@
 }
 
 // Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+void ScaleARGBColsUp2_C(uint8* dst_argb,
+                        const uint8* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
   const uint32* src = (const uint32*)(src_argb);
   uint32* dst = (uint32*)(dst_argb);
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[1] = dst[0] = src[0];
     src += 1;
@@ -818,15 +939,18 @@
 
 // TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
-#define BLENDERC(a, b, f, s) (uint32)( \
-    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
-#define BLENDER(a, b, f) \
-    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
-    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+  (uint32)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f)                                                 \
+  BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
+      BLENDERC(a, b, f, 0)
 
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_C(uint8* dst_argb,
+                           const uint8* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
   const uint32* src = (const uint32*)(src_argb);
   uint32* dst = (uint32*)(dst_argb);
   int j;
@@ -854,8 +978,11 @@
   }
 }
 
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
-                             int dst_width, int x32, int dx) {
+void ScaleARGBFilterCols64_C(uint8* dst_argb,
+                             const uint8* src_argb,
+                             int dst_width,
+                             int x32,
+                             int dx) {
   int64 x = (int64)(x32);
   const uint32* src = (const uint32*)(src_argb);
   uint32* dst = (uint32*)(dst_argb);
@@ -889,16 +1016,22 @@
 
 // Scale plane vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_argb, uint8* dst_argb,
-                        int x, int y, int dy,
-                        int bpp, enum FilterMode filtering) {
+                        int dst_width,
+                        int dst_height,
+                        int src_stride,
+                        int dst_stride,
+                        const uint8* src_argb,
+                        uint8* dst_argb,
+                        int x,
+                        int y,
+                        int dy,
+                        int bpp,
+                        enum FilterMode filtering) {
   // TODO(fbarchard): Allow higher bpp.
   int dst_width_bytes = dst_width * bpp;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
+  void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
   const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   int j;
   assert(bpp >= 1 && bpp <= 4);
@@ -931,15 +1064,23 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
+      IS_ALIGNED(dst_stride, 4)) {
     InterpolateRow = InterpolateRow_Any_DSPR2;
     if (IS_ALIGNED(dst_width_bytes, 4)) {
       InterpolateRow = InterpolateRow_DSPR2;
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
   for (j = 0; j < dst_height; ++j) {
     int yi;
     int yf;
@@ -948,23 +1089,29 @@
     }
     yi = y >> 16;
     yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_bytes, yf);
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_bytes, yf);
     dst_argb += dst_stride;
     y += dy;
   }
 }
 void ScalePlaneVertical_16(int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint16* src_argb, uint16* dst_argb,
-                           int x, int y, int dy,
-                           int wpp, enum FilterMode filtering) {
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16* src_argb,
+                           uint16* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering) {
   // TODO(fbarchard): Allow higher wpp.
   int dst_width_words = dst_width * wpp;
-  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
+  void (*InterpolateRow)(uint16 * dst_argb, const uint16* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
   const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   int j;
   assert(wpp >= 1 && wpp <= 2);
@@ -1005,9 +1152,9 @@
   }
 #endif
 #if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
+      IS_ALIGNED(dst_stride, 4)) {
     InterpolateRow = InterpolateRow_Any_16_DSPR2;
     if (IS_ALIGNED(dst_width_bytes, 4)) {
       InterpolateRow = InterpolateRow_16_DSPR2;
@@ -1022,16 +1169,18 @@
     }
     yi = y >> 16;
     yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_words, yf);
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_words, yf);
     dst_argb += dst_stride;
     y += dy;
   }
 }
 
 // Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
-                                  int dst_width, int dst_height,
+enum FilterMode ScaleFilterReduce(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
                                   enum FilterMode filtering) {
   if (src_width < 0) {
     src_width = -src_width;
@@ -1078,17 +1227,21 @@
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv1_C(int num, int div) {
-  return (int)((((int64)(num) << 16) - 0x00010001) /
-                          (div - 1));
+  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
 }
 
 #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
 
 // Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
-                int dst_width, int dst_height,
+void ScaleSlope(int src_width,
+                int src_height,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering,
-                int* x, int* y, int* dx, int* dy) {
+                int* x,
+                int* y,
+                int* dx,
+                int* dy) {
   assert(x != NULL);
   assert(y != NULL);
   assert(dx != NULL);
@@ -1120,7 +1273,7 @@
       *x = 0;
     }
     if (dst_height <= src_height) {
-      *dy = FixedDiv(src_height,  dst_height);
+      *dy = FixedDiv(src_height, dst_height);
       *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
     } else if (dst_height > 1) {
       *dy = FixedDiv1(src_height, dst_height);
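
To make the 16.16 fixed-point stepping used by ScaleSlope and the ScaleCols_C/ScaleFilterCols_C row functions above concrete, here is a small sketch. The widths are made up, it assumes FixedDiv is the plain (num << 16) / div form used elsewhere in scale_common.cc, and CENTERSTART(dx, -32768) is expanded inline.

// Illustrative only: derive the per-column step dx in 16.16 fixed point and
// walk it the way ScaleCols_C does (dst[j] = src[x >> 16]; x += dx).
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int src_width = 1280, dst_width = 640;
  int dx = (int)(((int64_t)src_width << 16) / dst_width);  // 0x20000 == 2.0
  int x = (dx >> 1) - 32768;  // center the filter: half a step in, minus 0.5
  int j;
  for (j = 0; j < 4; ++j) {
    printf("dst[%d] <- src[%d]\n", j, x >> 16);  // src columns 0, 2, 4, 6
    x += dx;
  }
  return 0;
}
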
diff --git a/files/source/scale_mips.cc b/files/source/scale_dspr2.cc
similarity index 60%
rename from files/source/scale_mips.cc
rename to files/source/scale_dspr2.cc
index ae95307..ddedcbf 100644
--- a/files/source/scale_mips.cc
+++ b/files/source/scale_dspr2.cc
@@ -17,168 +17,167 @@
 #endif
 
 // This module is for GCC MIPS DSPR2
-#if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
+#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
+    (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
 
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width) {
+void ScaleRowDown2_DSPR2(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst,
+                         int dst_width) {
   __asm__ __volatile__(
-    ".set push                                     \n"
-    ".set noreorder                                \n"
+      ".set push                                     \n"
+      ".set noreorder                                \n"
 
-    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
-    "beqz           $t9, 2f                        \n"
-    " nop                                          \n"
+      "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
+      "beqz           $t9, 2f                        \n"
+      " nop                                          \n"
 
-  "1:                                              \n"
-    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
-    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
-    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
-    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
-    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
-    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
-    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
-    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
-    // TODO(fbarchard): Use odd pixels instead of even.
-    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
-    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
-    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
-    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
-    "addiu          %[src_ptr], %[src_ptr], 32     \n"
-    "addiu          $t9, $t9, -1                   \n"
-    "sw             $t8, 0(%[dst])                 \n"
-    "sw             $t0, 4(%[dst])                 \n"
-    "sw             $t1, 8(%[dst])                 \n"
-    "sw             $t2, 12(%[dst])                \n"
-    "bgtz           $t9, 1b                        \n"
-    " addiu         %[dst], %[dst], 16             \n"
+      "1:                                            \n"
+      "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
+      "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
+      "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
+      "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
+      "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
+      "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
+      "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
+      "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
+      // Take the odd pixels of each pair, matching ScaleRowDown2_C.
+      "precrq.qb.ph   $t8, $t1, $t0                  \n"  // |7|5|3|1|
+      "precrq.qb.ph   $t0, $t3, $t2                  \n"  // |15|13|11|9|
+      "precrq.qb.ph   $t1, $t5, $t4                  \n"  // |23|21|19|17|
+      "precrq.qb.ph   $t2, $t7, $t6                  \n"  // |31|29|27|25|
+      "addiu          %[src_ptr], %[src_ptr], 32     \n"
+      "addiu          $t9, $t9, -1                   \n"
+      "sw             $t8, 0(%[dst])                 \n"
+      "sw             $t0, 4(%[dst])                 \n"
+      "sw             $t1, 8(%[dst])                 \n"
+      "sw             $t2, 12(%[dst])                \n"
+      "bgtz           $t9, 1b                        \n"
+      " addiu         %[dst], %[dst], 16             \n"
 
-  "2:                                              \n"
-    "andi           $t9, %[dst_width], 0xf         \n"  // residue
-    "beqz           $t9, 3f                        \n"
-    " nop                                          \n"
+      "2:                                            \n"
+      "andi           $t9, %[dst_width], 0xf         \n"  // residue
+      "beqz           $t9, 3f                        \n"
+      " nop                                          \n"
 
-  "21:                                             \n"
-    "lbu            $t0, 0(%[src_ptr])             \n"
-    "addiu          %[src_ptr], %[src_ptr], 2      \n"
-    "addiu          $t9, $t9, -1                   \n"
-    "sb             $t0, 0(%[dst])                 \n"
-    "bgtz           $t9, 21b                       \n"
-    " addiu         %[dst], %[dst], 1              \n"
+      "21:                                           \n"
+      "lbu            $t0, 1(%[src_ptr])             \n"
+      "addiu          %[src_ptr], %[src_ptr], 2      \n"
+      "addiu          $t9, $t9, -1                   \n"
+      "sb             $t0, 0(%[dst])                 \n"
+      "bgtz           $t9, 21b                       \n"
+      " addiu         %[dst], %[dst], 1              \n"
 
-  "3:                                              \n"
-    ".set pop                                      \n"
-  : [src_ptr] "+r" (src_ptr),
-    [dst] "+r" (dst)
-  : [dst_width] "r" (dst_width)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
+      "3:                                            \n"
+      ".set pop                                      \n"
+      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
+      : [dst_width] "r"(dst_width)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
 }
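
Note that besides the re-indentation, the loop above now packs the odd source pixels (precrq, |7|5|3|1| per the new comments) and the residue loop loads offset 1, which matches ScaleRowDown2_C earlier in this CL. A scalar sketch of that behavior (HalveRowPointSample is a hypothetical name, not part of the patch):

// Illustrative only: half-width point sampling keeps every odd source pixel,
// the same result the rewritten DSPR2 loop produces.
#include <stdint.h>

static void HalveRowPointSample(const uint8_t* src, uint8_t* dst,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];  // odd pixel of each 2-pixel pair
  }
}
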
 
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst,
+                            int dst_width) {
   const uint8* t = src_ptr + src_stride;
 
-  __asm__ __volatile__ (
-    ".set push                                    \n"
-    ".set noreorder                               \n"
+  __asm__ __volatile__(
+      ".set push                                    \n"
+      ".set noreorder                               \n"
 
-    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
-    "bltz           $t9, 2f                       \n"
-    " nop                                         \n"
+      "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
+      "bltz           $t9, 2f                       \n"
+      " nop                                         \n"
 
-  "1:                                             \n"
-    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
-    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
-    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
-    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
-    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
-    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
-    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
-    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
-    "addiu          $t9, $t9, -1                  \n"
-    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
-    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
-    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
-    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
-    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
-    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
-    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
-    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
-    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
-    "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
-    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
-    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
-    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
-    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
-    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
-    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
-    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
-    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
-    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
-    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
-    "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
-    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
-    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
-    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
-    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
-    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
-    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
-    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
-    "addiu          %[src_ptr], %[src_ptr], 16    \n"
-    "addiu          %[t], %[t], 16                \n"
-    "sb             $t0, 0(%[dst])                \n"
-    "sb             $t4, 1(%[dst])                \n"
-    "sb             $t1, 2(%[dst])                \n"
-    "sb             $t5, 3(%[dst])                \n"
-    "sb             $t2, 4(%[dst])                \n"
-    "sb             $t6, 5(%[dst])                \n"
-    "sb             $t3, 6(%[dst])                \n"
-    "sb             $t7, 7(%[dst])                \n"
-    "bgtz           $t9, 1b                       \n"
-    " addiu         %[dst], %[dst], 8             \n"
+      "1:                                           \n"
+      "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
+      "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
+      "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
+      "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
+      "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
+      "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
+      "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
+      "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
+      "addiu          $t9, $t9, -1                  \n"
+      "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
+      "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
+      "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
+      "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
+      "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
+      "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
+      "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
+      "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
+      "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
+      "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
+      "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
+      "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
+      "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
+      "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
+      "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
+      "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
+      "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
+      "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
+      "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
+      "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
+      "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
+      "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
+      "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
+      "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
+      "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
+      "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
+      "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
+      "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
+      "addiu          %[src_ptr], %[src_ptr], 16    \n"
+      "addiu          %[t], %[t], 16                \n"
+      "sb             $t0, 0(%[dst])                \n"
+      "sb             $t4, 1(%[dst])                \n"
+      "sb             $t1, 2(%[dst])                \n"
+      "sb             $t5, 3(%[dst])                \n"
+      "sb             $t2, 4(%[dst])                \n"
+      "sb             $t6, 5(%[dst])                \n"
+      "sb             $t3, 6(%[dst])                \n"
+      "sb             $t7, 7(%[dst])                \n"
+      "bgtz           $t9, 1b                       \n"
+      " addiu         %[dst], %[dst], 8             \n"
 
-  "2:                                             \n"
-    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
-    "beqz           $t9, 3f                       \n"
-    " nop                                         \n"
+      "2:                                           \n"
+      "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
+      "beqz           $t9, 3f                       \n"
+      " nop                                         \n"
 
-    "21:                                          \n"
-    "lwr            $t1, 0(%[src_ptr])            \n"
-    "lwl            $t1, 3(%[src_ptr])            \n"
-    "lwr            $t2, 0(%[t])                  \n"
-    "lwl            $t2, 3(%[t])                  \n"
-    "srl            $t8, $t1, 16                  \n"
-    "ins            $t1, $t2, 16, 16              \n"
-    "ins            $t2, $t8, 0, 16               \n"
-    "raddu.w.qb     $t1, $t1                      \n"
-    "raddu.w.qb     $t2, $t2                      \n"
-    "shra_r.w       $t1, $t1, 2                   \n"
-    "shra_r.w       $t2, $t2, 2                   \n"
-    "sb             $t1, 0(%[dst])                \n"
-    "sb             $t2, 1(%[dst])                \n"
-    "addiu          %[src_ptr], %[src_ptr], 4     \n"
-    "addiu          $t9, $t9, -2                  \n"
-    "addiu          %[t], %[t], 4                 \n"
-    "bgtz           $t9, 21b                      \n"
-    " addiu         %[dst], %[dst], 2             \n"
+      "21:                                          \n"
+      "lwr            $t1, 0(%[src_ptr])            \n"
+      "lwl            $t1, 3(%[src_ptr])            \n"
+      "lwr            $t2, 0(%[t])                  \n"
+      "lwl            $t2, 3(%[t])                  \n"
+      "srl            $t8, $t1, 16                  \n"
+      "ins            $t1, $t2, 16, 16              \n"
+      "ins            $t2, $t8, 0, 16               \n"
+      "raddu.w.qb     $t1, $t1                      \n"
+      "raddu.w.qb     $t2, $t2                      \n"
+      "shra_r.w       $t1, $t1, 2                   \n"
+      "shra_r.w       $t2, $t2, 2                   \n"
+      "sb             $t1, 0(%[dst])                \n"
+      "sb             $t2, 1(%[dst])                \n"
+      "addiu          %[src_ptr], %[src_ptr], 4     \n"
+      "addiu          $t9, $t9, -2                  \n"
+      "addiu          %[t], %[t], 4                 \n"
+      "bgtz           $t9, 21b                      \n"
+      " addiu         %[dst], %[dst], 2             \n"
 
-  "3:                                             \n"
-    ".set pop                                     \n"
+      "3:                                           \n"
+      ".set pop                                     \n"
 
-  : [src_ptr] "+r" (src_ptr),
-    [dst] "+r" (dst), [t] "+r" (t)
-  : [dst_width] "r" (dst_width)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
+      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
+      : [dst_width] "r"(dst_width)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
 }
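
For reference, the raddu.w.qb/shra_r.w pairs above compute a rounded 2x2 box
average over two rows. A minimal scalar sketch of the same arithmetic (a
hypothetical helper modeled on the C fallback, not a transcription of the
DSPR2 register shuffling):

// Rounded 2x2 box filter: add two horizontal pixels from each of two rows,
// then (sum + 2) >> 2.
static void ScaleRowDown2Box_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst, int dst_width) {
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (src_ptr[0] + src_ptr[1] + t[0] + t[1] + 2) >> 2;
    src_ptr += 2;
    t += 2;
  }
}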
 
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
+void ScaleRowDown4_DSPR2(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst,
+                         int dst_width) {
+  __asm__ __volatile__(
       ".set push                                    \n"
       ".set noreorder                               \n"
 
@@ -186,7 +185,7 @@
       "beqz           $t9, 2f                       \n"
       " nop                                         \n"
 
-     "1:                                            \n"
+      "1:                                           \n"
       "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
       "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
       "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
@@ -199,8 +198,8 @@
       "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
       "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
       "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
-      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
-      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
+      "precrq.qb.ph   $t1, $t2, $t1                 \n"  // |14|10|6|2|
+      "precrq.qb.ph   $t5, $t6, $t5                 \n"  // |30|26|22|18|
       "addiu          %[src_ptr], %[src_ptr], 32    \n"
       "addiu          $t9, $t9, -1                  \n"
       "sw             $t1, 0(%[dst])                \n"
@@ -208,44 +207,43 @@
       "bgtz           $t9, 1b                       \n"
       " addiu         %[dst], %[dst], 8             \n"
 
-    "2:                                             \n"
+      "2:                                           \n"
       "andi           $t9, %[dst_width], 7          \n"  // residue
       "beqz           $t9, 3f                       \n"
       " nop                                         \n"
 
-    "21:                                            \n"
-      "lbu            $t1, 0(%[src_ptr])            \n"
+      "21:                                          \n"
+      "lbu            $t1, 2(%[src_ptr])            \n"
       "addiu          %[src_ptr], %[src_ptr], 4     \n"
       "addiu          $t9, $t9, -1                  \n"
       "sb             $t1, 0(%[dst])                \n"
       "bgtz           $t9, 21b                      \n"
       " addiu         %[dst], %[dst], 1             \n"
 
-    "3:                                             \n"
+      "3:                                           \n"
       ".set pop                                     \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst)
-      : [dst_width] "r" (dst_width)
-      : "t1", "t2", "t3", "t4", "t5",
-        "t6", "t7", "t8", "t9"
-  );
+      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
+      : [dst_width] "r"(dst_width)
+      : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
 }
 
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst,
+                            int dst_width) {
   intptr_t stride = src_stride;
   const uint8* s1 = src_ptr + stride;
   const uint8* s2 = s1 + stride;
   const uint8* s3 = s2 + stride;
 
-  __asm__ __volatile__ (
+  __asm__ __volatile__(
       ".set push                                  \n"
       ".set noreorder                             \n"
 
       "srl           $t9, %[dst_width], 1         \n"
       "andi          $t8, %[dst_width], 1         \n"
 
-     "1:                                          \n"
+      "1:                                         \n"
       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
       "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
@@ -299,23 +297,20 @@
       "2:                                         \n"
       ".set pop                                   \n"
 
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [s1] "+r" (s1),
-        [s2] "+r" (s2),
-        [s3] "+r" (s3)
-      : [dst_width] "r" (dst_width)
-      : "t0", "t1", "t2", "t3", "t4", "t5",
-        "t6","t7", "t8", "t9"
-  );
+      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
+        [s3] "+r"(s3)
+      : [dst_width] "r"(dst_width)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
 }
 
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
+void ScaleRowDown34_DSPR2(const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8* dst,
+                          int dst_width) {
+  __asm__ __volatile__(
       ".set push                                          \n"
       ".set noreorder                                     \n"
-    "1:                                                   \n"
+      "1:                                                 \n"
       "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
       "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
       "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
@@ -347,23 +342,21 @@
       "bnez            %[dst_width], 1b                   \n"
       " addiu          %[dst], %[dst], 24                 \n"
       ".set pop                                           \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [dst_width] "+r" (dst_width)
+      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
       :
-      : "t0", "t1", "t2", "t3", "t4", "t5",
-        "t6","t7", "t8", "t9"
-  );
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
 }
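
ScaleRowDown34 is a point sampler: assuming the same selection as the C
fallback, it keeps bytes 0, 1 and 3 of every 4 source bytes to produce 3
output bytes. A sketch with a hypothetical name:

// 3/4 down-scale without filtering: drop byte 2 of every group of 4.
static void ScaleRowDown34_Sketch(const uint8* src_ptr, uint8* dst,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  }
}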
 
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width) {
-  __asm__ __volatile__ (
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* d,
+                                int dst_width) {
+  __asm__ __volatile__(
       ".set push                                         \n"
       ".set noreorder                                    \n"
       "repl.ph           $t3, 3                          \n"  // 0x00030003
 
-    "1:                                                  \n"
+      "1:                                                \n"
       "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
       "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
       "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
@@ -400,26 +393,24 @@
       "sb                $t6, 2(%[d])                    \n"
       "bgtz              %[dst_width], 1b                \n"
       " addiu            %[d], %[d], 3                   \n"
-    "3:                                                  \n"
+      "3:                                                \n"
       ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [src_stride] "+r" (src_stride),
-        [d] "+r" (d),
-        [dst_width] "+r" (dst_width)
+      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
+        [dst_width] "+r"(dst_width)
       :
-      : "t0", "t1", "t2", "t3",
-        "t4", "t5", "t6"
-  );
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
 }
 
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width) {
-  __asm__ __volatile__ (
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* d,
+                                int dst_width) {
+  __asm__ __volatile__(
       ".set push                                           \n"
       ".set noreorder                                      \n"
       "repl.ph           $t2, 3                            \n"  // 0x00030003
 
-    "1:                                                    \n"
+      "1:                                                  \n"
       "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
       "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
       "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
@@ -452,25 +443,23 @@
       "sb                $t6, 2(%[d])                      \n"
       "bgtz              %[dst_width], 1b                  \n"
       " addiu            %[d], %[d], 3                     \n"
-    "3:                                                    \n"
+      "3:                                                  \n"
       ".set pop                                            \n"
-      : [src_ptr] "+r" (src_ptr),
-        [src_stride] "+r" (src_stride),
-        [d] "+r" (d),
-        [dst_width] "+r" (dst_width)
+      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
+        [dst_width] "+r"(dst_width)
       :
-      : "t0", "t1", "t2", "t3",
-        "t4", "t5", "t6"
-  );
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
 }
 
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
+void ScaleRowDown38_DSPR2(const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8* dst,
+                          int dst_width) {
+  __asm__ __volatile__(
       ".set push                                     \n"
       ".set noreorder                                \n"
 
-    "1:                                              \n"
+      "1:                                            \n"
       "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
       "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
       "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
@@ -501,26 +490,24 @@
       "bgez       $t8, 1b                            \n"
       " addiu     %[dst], %[dst], 12                 \n"
       ".set pop                                      \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [dst_width] "+r" (dst_width)
+      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
       :
-      : "t0", "t1", "t2", "t3", "t4",
-        "t5", "t6", "t7", "t8"
-  );
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
 }
 
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr,
+                                int dst_width) {
   intptr_t stride = src_stride;
   const uint8* t = src_ptr + stride;
   const int c = 0x2AAA;
 
-  __asm__ __volatile__ (
+  __asm__ __volatile__(
       ".set push                                         \n"
       ".set noreorder                                    \n"
 
-    "1:                                                  \n"
+      "1:                                                \n"
       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
       "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
@@ -554,18 +541,16 @@
       "bgtz            %[dst_width], 1b                  \n"
       " sb             $t0, -3(%[dst_ptr])               \n"
       ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst_ptr] "+r" (dst_ptr),
-        [t] "+r" (t),
-        [dst_width] "+r" (dst_width)
-      : [c] "r" (c)
-      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
-  );
+      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
+        [dst_width] "+r"(dst_width)
+      : [c] "r"(c)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
 }
 
 void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+                                uint8* dst_ptr,
+                                int dst_width) {
   intptr_t stride = src_stride;
   const uint8* s1 = src_ptr + stride;
   stride += stride;
@@ -573,11 +558,11 @@
   const int c1 = 0x1C71;
   const int c2 = 0x2AAA;
 
-  __asm__ __volatile__ (
+  __asm__ __volatile__(
       ".set push                                         \n"
       ".set noreorder                                    \n"
 
-    "1:                                                  \n"
+      "1:                                                \n"
       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
       "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
@@ -624,15 +609,55 @@
       "bgtz            %[dst_width], 1b                  \n"
       " sb             $t0, -3(%[dst_ptr])               \n"
       ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst_ptr] "+r" (dst_ptr),
-        [s1] "+r" (s1),
-        [s2] "+r" (s2),
-        [dst_width] "+r" (dst_width)
-      : [c1] "r" (c1), [c2] "r" (c2)
-      : "t0", "t1", "t2", "t3", "t4",
-        "t5", "t6", "t7", "t8"
-  );
+      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
+        [s2] "+r"(s2), [dst_width] "+r"(dst_width)
+      : [c1] "r"(c1), [c2] "r"(c2)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
+}
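
The constants above are fixed-point reciprocals: 0x1C71 is roughly 65536/9 and
0x2AAA roughly 65536/6, so a multiply followed by taking the upper 16 bits
stands in for dividing the 3x3 and 3x2 box sums. A hypothetical helper showing
the arithmetic (the truncated reciprocal can round the result down by one):

// avg ~= (sum * reciprocal) >> 16, with reciprocal = 65536 / box_size.
static inline uint32 BoxSumScale(uint32 sum, uint32 reciprocal) {
  return (sum * reciprocal) >> 16;
}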
+
+void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  int x;
+  for (x = 0; x < ((src_width - 1)); x += 8) {
+    uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
+    uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
+    __asm__ __volatile__(
+        ".set push                                                \n"
+        ".set noreorder                                           \n"
+        "lw                %[tmp_t5],   0(%[src_ptr])             \n"
+        "lw                %[tmp_t6],   4(%[src_ptr])             \n"
+        "lw                %[tmp_t1],   0(%[dst_ptr])             \n"
+        "lw                %[tmp_t2],   4(%[dst_ptr])             \n"
+        "lw                %[tmp_t3],   8(%[dst_ptr])             \n"
+        "lw                %[tmp_t4],   12(%[dst_ptr])            \n"
+        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t5]                 \n"
+        "preceu.ph.qbl     %[tmp_t8],   %[tmp_t5]                 \n"
+        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t7]  \n"
+        "addu.ph           %[tmp_t2],   %[tmp_t2],     %[tmp_t8]  \n"
+        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t6]                 \n"
+        "preceu.ph.qbl     %[tmp_t8],   %[tmp_t6]                 \n"
+        "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t7]  \n"
+        "addu.ph           %[tmp_t4],   %[tmp_t4],     %[tmp_t8]  \n"
+        "sw                %[tmp_t1],   0(%[dst_ptr])             \n"
+        "sw                %[tmp_t2],   4(%[dst_ptr])             \n"
+        "sw                %[tmp_t3],   8(%[dst_ptr])             \n"
+        "sw                %[tmp_t4],   12(%[dst_ptr])            \n"
+        ".set pop                                                 \n"
+        :
+        [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
+        [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
+        [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
+        : [dst_ptr] "r"(dst_ptr));
+    src_ptr += 8;
+    dst_ptr += 8;
+  }
+
+  if ((src_width)&7) {
+    for (x = 0; x < ((src_width - 1) & 7); x += 1) {
+      dst_ptr[0] += src_ptr[0];
+      src_ptr += 1;
+      dst_ptr += 1;
+    }
+  }
 }
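
The preceu.ph.qb*/addu.ph pairs above widen four source bytes at a time and add
them into the 16-bit accumulator row; the scalar equivalent is simply a
widening add per pixel (sketch, hypothetical name):

// Accumulate one row of bytes into a row of 16-bit sums.
static void ScaleAddRow_Sketch(const uint8* src_ptr, uint16* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];
  }
}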
 
 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@@ -641,4 +666,3 @@
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc
index e2f8854..f0ac56f 100644
--- a/files/source/scale_gcc.cc
+++ b/files/source/scale_gcc.cc
@@ -21,85 +21,82 @@
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
 // Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
+                       128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
+                       128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
+                       128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
 
 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
+                        10, 11, 12, 13, 13, 14, 14, 15};
 
 // Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
 
 // Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
 
 // Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
 
 // Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
+static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
 
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
+                         128, 128, 128, 128, 128, 128, 128, 128};
 
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
+                         6,   8,   11,  14,  128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
+                        128, 128, 128, 128, 128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
+                         6,   7,   12,  13,  128, 128, 128, 128};
 
 // Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+                            65536 / 9, 65536 / 6, 0,         0};
 
 // Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
+                         11, 128, 14, 128, 128, 128, 128, 128};
 
 // Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
+                         12, 128, 15, 128, 128, 128, 128, 128};
 
 // Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
+                         13, 128, 128, 128, 128, 128, 128, 128};
 
 // Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+                           65536 / 3, 65536 / 2, 0,         0};
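
kScaleAc33 and kScaleAb2 hold 65536/n so pmulhuw (unsigned multiply, keep the
high 16 bits) can scale each box sum instead of dividing it. In scalar form the
step is roughly the following (hypothetical illustration; the truncated
reciprocal may round down by one):

// avg ~= (sum * (65536 / n)) >> 16, with n in {9, 6, 3, 2} per output column.
static inline uint16 MulHiScale(uint16 sum, uint16 scale) {
  return (uint16)(((uint32)sum * scale) >> 16);
}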
 
 // GCC versions of row functions are verbatim conversions from Visual C.
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
 
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -120,8 +117,11 @@
   );
 }
 
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr,
+                               int dst_width) {
+  (void)src_stride;
   asm volatile (
     "pcmpeqb    %%xmm4,%%xmm4                  \n"
     "psrlw      $0xf,%%xmm4                    \n"
@@ -149,8 +149,10 @@
   );
 }
 
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr,
+                            int dst_width) {
   asm volatile (
     "pcmpeqb    %%xmm4,%%xmm4                  \n"
     "psrlw      $0xf,%%xmm4                    \n"
@@ -189,8 +191,11 @@
 }
 
 #ifdef HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_AVX2(const uint8* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -213,8 +218,11 @@
   );
 }
 
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8* dst_ptr,
+                              int dst_width) {
+  (void)src_stride;
   asm volatile (
     "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
     "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
@@ -244,8 +252,10 @@
   );
 }
 
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8* dst_ptr,
+                           int dst_width) {
   asm volatile (
     "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
     "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
@@ -286,8 +296,11 @@
 }
 #endif  // HAS_SCALEROWDOWN2_AVX2
 
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrld     $0x18,%%xmm5                    \n"
@@ -314,8 +327,10 @@
   );
 }
 
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr,
+                            int dst_width) {
   intptr_t stridex3;
   asm volatile (
     "pcmpeqb    %%xmm4,%%xmm4                  \n"
@@ -368,10 +383,12 @@
   );
 }
 
-
 #ifdef HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4_AVX2(const uint8* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
   asm volatile (
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
     "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
@@ -400,8 +417,10 @@
   );
 }
 
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8* dst_ptr,
+                           int dst_width) {
   asm volatile (
     "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
     "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
@@ -455,17 +474,20 @@
 }
 #endif  // HAS_SCALEROWDOWN4_AVX2
 
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm3                       \n"
-    "movdqa    %1,%%xmm4                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kShuf0),  // %0
-    "m"(kShuf1),  // %1
-    "m"(kShuf2)   // %2
-  );
+void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "movdqa    %0,%%xmm3                       \n"
+      "movdqa    %1,%%xmm4                       \n"
+      "movdqa    %2,%%xmm5                       \n"
+      :
+      : "m"(kShuf0),  // %0
+        "m"(kShuf1),  // %1
+        "m"(kShuf2)   // %2
+      );
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -492,25 +514,26 @@
 
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
+                                uint8* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"  // kShuf01
+      "movdqa    %1,%%xmm3                       \n"  // kShuf11
+      "movdqa    %2,%%xmm4                       \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
+      );
+  asm volatile(
+      "movdqa    %0,%%xmm5                       \n"  // kMadd01
+      "movdqa    %1,%%xmm0                       \n"  // kMadd11
+      "movdqa    %2,%%xmm1                       \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
+      );
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -557,25 +580,26 @@
 
 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
+                                uint8* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"  // kShuf01
+      "movdqa    %1,%%xmm3                       \n"  // kShuf11
+      "movdqa    %2,%%xmm4                       \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
+      );
+  asm volatile(
+      "movdqa    %0,%%xmm5                       \n"  // kMadd01
+      "movdqa    %1,%%xmm0                       \n"  // kMadd11
+      "movdqa    %2,%%xmm1                       \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
+      );
 
   asm volatile (
     LABELALIGN
@@ -624,8 +648,11 @@
   );
 }
 
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
   asm volatile (
     "movdqa    %3,%%xmm4                       \n"
     "movdqa    %4,%%xmm5                       \n"
@@ -655,18 +682,19 @@
 
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "movdqa    %3,%%xmm5                       \n"
-  :
-  : "m"(kShufAb0),   // %0
-    "m"(kShufAb1),   // %1
-    "m"(kShufAb2),   // %2
-    "m"(kScaleAb2)   // %3
-  );
+                                uint8* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"
+      "movdqa    %1,%%xmm3                       \n"
+      "movdqa    %2,%%xmm4                       \n"
+      "movdqa    %3,%%xmm5                       \n"
+      :
+      : "m"(kShufAb0),  // %0
+        "m"(kShufAb1),  // %1
+        "m"(kShufAb2),  // %2
+        "m"(kScaleAb2)  // %3
+      );
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -700,17 +728,18 @@
 
 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-  :
-  : "m"(kShufAc),    // %0
-    "m"(kShufAc3),   // %1
-    "m"(kScaleAc33)  // %2
-  );
+                                uint8* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"
+      "movdqa    %1,%%xmm3                       \n"
+      "movdqa    %2,%%xmm4                       \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
+      :
+      : "m"(kShufAc),    // %0
+        "m"(kShufAc3),   // %1
+        "m"(kScaleAc33)  // %2
+      );
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -790,7 +819,6 @@
   );
 }
 
-
 #ifdef HAS_SCALEADDROW_AVX2
 // Reads 32 bytes and accumulates to 32 shorts at a time.
 void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
@@ -823,17 +851,19 @@
 
 // Constant for making pixels signed to avoid pmaddubsw
 // saturation.
-static uvec8 kFsub80 =
-  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
 
 // Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
-  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+                         0x4040, 0x4040, 0x4040, 0x4040};
 
 // Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
+void ScaleFilterCols_SSSE3(uint8* dst_ptr,
+                           const uint8* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx) {
   intptr_t x0, x1, temp_pixel;
   asm volatile (
     "movd      %6,%%xmm2                       \n"
@@ -867,7 +897,7 @@
     "pshufb    %%xmm5,%%xmm1                   \n"
     "punpcklwd %%xmm4,%%xmm0                   \n"
     "psubb     %8,%%xmm0                       \n"  // make pixels signed.
-    "pxor      %%xmm6,%%xmm1                   \n"  // 128 -f = (f ^ 127 ) + 1
+    "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127 ) + 1
     "paddusb   %%xmm7,%%xmm1                   \n"
     "pmaddubsw %%xmm0,%%xmm1                   \n"
     "pextrw    $0x1,%%xmm2,%k3                 \n"
@@ -925,8 +955,13 @@
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleColsUp2_SSE2(uint8* dst_ptr,
+                       const uint8* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
+  (void)x;
+  (void)dx;
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -950,7 +985,9 @@
 
 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                             ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
+                            uint8* dst_argb,
+                            int dst_width) {
+  (void)src_stride;
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -971,7 +1008,9 @@
 
 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
+                                  uint8* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -995,7 +1034,8 @@
 
 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
+                               uint8* dst_argb,
+                               int dst_width) {
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -1025,10 +1065,14 @@
 
 // Reads 4 pixels at a time.
 // Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb,
+                               int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
   intptr_t src_stepx_x12;
+  (void)src_stride;
   asm volatile (
     "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
     "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
@@ -1059,8 +1103,10 @@
 // Blends four 2x2 to 4x1.
 // Alignment requirement: dst_argb 16 byte aligned.
 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride, int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb,
+                                  int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
   intptr_t src_stepx_x12;
   intptr_t row1 = (intptr_t)(src_stride);
@@ -1102,8 +1148,11 @@
   );
 }
 
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+void ScaleARGBCols_SSE2(uint8* dst_argb,
+                        const uint8* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
   intptr_t x0, x1;
   asm volatile (
     "movd      %5,%%xmm2                       \n"
@@ -1171,8 +1220,13 @@
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
+                           const uint8* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  (void)x;
+  (void)dx;
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -1197,26 +1251,29 @@
 
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
 static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
+    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
 static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
 
 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
+                               const uint8* src_argb,
+                               int dst_width,
+                               int x,
+                               int dx) {
   intptr_t x0, x1;
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm5                       \n"
-  :
-  : "m"(kShuffleColARGB),  // %0
-    "m"(kShuffleFractions)  // %1
-  );
+  asm volatile(
+      "movdqa    %0,%%xmm4                       \n"
+      "movdqa    %1,%%xmm5                       \n"
+      :
+      : "m"(kShuffleColARGB),   // %0
+        "m"(kShuffleFractions)  // %1
+      );
 
   asm volatile (
     "movd      %5,%%xmm2                       \n"
@@ -1283,34 +1340,32 @@
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
+  asm volatile(
+      "cdq                                       \n"
+      "shld      $0x10,%%eax,%%edx               \n"
+      "shl       $0x10,%%eax                     \n"
+      "idiv      %1                              \n"
+      "mov       %0, %%eax                       \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
   return num;
 }
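
The asm above shifts num into 16.16 fixed point across edx:eax and then
divides; a portable sketch of the same result (not the libyuv C fallback
verbatim):

// 16.16 fixed-point divide, done in 64 bits so the shift cannot overflow.
static int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64)(num) << 16) / div);
}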
 
 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
 int FixedDiv1_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "sub       $0x10001,%%eax                  \n"
-    "sbb       $0x0,%%edx                      \n"
-    "sub       $0x1,%1                         \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
+  asm volatile(
+      "cdq                                       \n"
+      "shld      $0x10,%%eax,%%edx               \n"
+      "shl       $0x10,%%eax                     \n"
+      "sub       $0x10001,%%eax                  \n"
+      "sbb       $0x0,%%edx                      \n"
+      "sub       $0x1,%1                         \n"
+      "idiv      %1                              \n"
+      "mov       %0, %%eax                       \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
   return num;
 }
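
The FixedDiv1 variant subtracts 0x10001 after the shift and divides by div - 1,
i.e. it computes ((num << 16) - 0x10001) / (div - 1). A portable sketch:

// Same operation as the inline asm, in plain C.
static int FixedDiv1_Sketch(int num, int div) {
  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}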
 
diff --git a/files/source/scale_msa.cc b/files/source/scale_msa.cc
new file mode 100644
index 0000000..bfcd10f
--- /dev/null
+++ b/files/source/scale_msa.cc
@@ -0,0 +1,553 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/scale_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    ST_UB(dst0, dst_argb);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+    vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
+    ST_UB(dst0, dst_argb);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  int x;
+  const uint8_t* s = src_argb;
+  const uint8_t* t = src_argb + src_stride;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+  v8u16 reg0, reg1, reg2, reg3;
+  v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
+
+  for (x = 0; x < dst_width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+    vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
+    vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
+    reg0 = __msa_hadd_u_h(vec0, vec0);
+    reg1 = __msa_hadd_u_h(vec1, vec1);
+    reg2 = __msa_hadd_u_h(vec2, vec2);
+    reg3 = __msa_hadd_u_h(vec3, vec3);
+    reg0 += reg2;
+    reg1 += reg3;
+    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
+    reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    ST_UB(dst0, dst_argb);
+    s += 32;
+    t += 32;
+    dst_argb += 16;
+  }
+}
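
The vshf.b/hadd_u_h/srari_h sequence above averages each channel of a 2x2 block
of ARGB pixels with rounding; an equivalent scalar sketch (hypothetical name):

// Per-channel rounded average of a 2x2 block of ARGB pixels.
static void ScaleARGBRowDown2Box_Sketch(const uint8_t* src_argb,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_argb,
                                        int dst_width) {
  const uint8_t* t = src_argb + src_stride;
  int x, i;
  for (x = 0; x < dst_width; ++x) {
    for (i = 0; i < 4; ++i) {
      dst_argb[i] = (src_argb[i] + src_argb[i + 4] + t[i] + t[i + 4] + 2) >> 2;
    }
    src_argb += 8;
    t += 8;
    dst_argb += 4;
  }
}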
+
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  int x;
+  int32_t stepx = src_stepx * 4;
+  int32_t data0, data1, data2, data3;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 4) {
+    data0 = LW(src_argb);
+    data1 = LW(src_argb + stepx);
+    data2 = LW(src_argb + stepx * 2);
+    data3 = LW(src_argb + stepx * 3);
+    SW(data0, dst_argb);
+    SW(data1, dst_argb + 4);
+    SW(data2, dst_argb + 8);
+    SW(data3, dst_argb + 12);
+    src_argb += stepx * 4;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8* dst_argb,
+                                 int dst_width) {
+  int x;
+  const uint8* nxt_argb = src_argb + src_stride;
+  int32_t stepx = src_stepx * 4;
+  int64_t data0, data1, data2, data3;
+  v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
+  v16u8 vec0, vec1, vec2, vec3;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 dst0;
+
+  for (x = 0; x < dst_width; x += 4) {
+    data0 = LD(src_argb);
+    data1 = LD(src_argb + stepx);
+    data2 = LD(src_argb + stepx * 2);
+    data3 = LD(src_argb + stepx * 3);
+    src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
+    src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
+    src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
+    src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
+    data0 = LD(nxt_argb);
+    data1 = LD(nxt_argb + stepx);
+    data2 = LD(nxt_argb + stepx * 2);
+    data3 = LD(nxt_argb + stepx * 3);
+    src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
+    src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
+    src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
+    src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
+    vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    reg0 = __msa_hadd_u_h(vec0, vec0);
+    reg1 = __msa_hadd_u_h(vec1, vec1);
+    reg2 = __msa_hadd_u_h(vec2, vec2);
+    reg3 = __msa_hadd_u_h(vec3, vec3);
+    reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
+    reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
+    reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
+    reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
+    reg4 += reg6;
+    reg5 += reg7;
+    reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
+    reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    ST_UB(dst0, dst_argb);
+    src_argb += stepx * 4;
+    nxt_argb += stepx * 4;
+    dst_argb += 16;
+  }
+}
+
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
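
__msa_pckod_b keeps the odd-indexed bytes, i.e. the second pixel of each
horizontal pair, which matches the unfiltered C path. Sketch (hypothetical
name):

// Unfiltered 2x down-scale: keep the second pixel of every pair.
static void ScaleRowDown2_Sketch(const uint8_t* src_ptr, uint8_t* dst,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x + 1];
  }
}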
+
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    dst0 = __msa_aver_u_b(vec1, vec0);
+    dst1 = __msa_aver_u_b(vec3, vec2);
+    ST_UB2(dst0, dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+    vec0 = __msa_hadd_u_h(src0, src0);
+    vec1 = __msa_hadd_u_h(src1, src1);
+    vec2 = __msa_hadd_u_h(src2, src2);
+    vec3 = __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
+    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
+    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
+    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    ST_UB2(dst0, dst1, dst, 16);
+    s += 64;
+    t += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst);
+    src_ptr += 64;
+    dst += 16;
+  }
+}
+
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t0 = s + src_stride;
+  const uint8_t* t1 = s + src_stride * 2;
+  const uint8_t* t2 = s + src_stride * 3;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4u32 reg0, reg1, reg2, reg3;
+
+  for (x = 0; x < dst_width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
+    vec0 = __msa_hadd_u_h(src0, src0);
+    vec1 = __msa_hadd_u_h(src1, src1);
+    vec2 = __msa_hadd_u_h(src2, src2);
+    vec3 = __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
+    vec0 += __msa_hadd_u_h(src0, src0);
+    vec1 += __msa_hadd_u_h(src1, src1);
+    vec2 += __msa_hadd_u_h(src2, src2);
+    vec3 += __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    reg0 = __msa_hadd_u_w(vec0, vec0);
+    reg1 = __msa_hadd_u_w(vec1, vec1);
+    reg2 = __msa_hadd_u_w(vec2, vec2);
+    reg3 = __msa_hadd_u_w(vec3, vec3);
+    reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
+    reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
+    reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
+    reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst);
+    s += 64;
+    t0 += 64;
+    t1 += 64;
+    t2 += 64;
+    dst += 16;
+  }
+}
+
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  int x, width;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, vec0;
+  v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+  (void)src_stride;
+
+  assert(dst_width % 3 == 0);
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
+    dst0 = __msa_copy_u_d((v2i64)vec0, 0);
+    dst1 = __msa_copy_u_w((v4i32)vec0, 2);
+    SD(dst0, dst);
+    SW(dst1, dst + 8);
+    src_ptr += 32;
+    dst += 12;
+  }
+}
+
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, width;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, src2, src3, out;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  v8i16 zero = {0};
+  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
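+  // Fixed-point reciprocals: 0x2AAA ~= 65536/6 scales the 3x2 (6-pixel) sums,
+  // 0x4000 = 65536/4 scales the 2x2 (4-pixel) sums, before the >> 16 below.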
+  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+  v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
+    vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
+    vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
+    vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    tmp0 = __msa_hadd_u_w(vec4, vec4);
+    tmp1 = __msa_hadd_u_w(vec5, vec5);
+    tmp2 = __msa_hadd_u_w(vec6, vec6);
+    tmp3 = __msa_hadd_u_w(vec7, vec7);
+    tmp4 = __msa_hadd_u_w(vec0, vec0);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    tmp0 = __msa_hadd_u_w(vec0, vec0);
+    tmp1 = __msa_hadd_u_w(vec1, vec1);
+    tmp0 *= const_0x2AAA;
+    tmp1 *= const_0x2AAA;
+    tmp4 *= const_0x4000;
+    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+    dst0 = __msa_copy_u_d((v2i64)out, 0);
+    dst1 = __msa_copy_u_w((v4i32)out, 2);
+    SD(dst0, dst_ptr);
+    SW(dst1, dst_ptr + 8);
+    s += 32;
+    t += 32;
+    dst_ptr += 12;
+  }
+}
+
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, width;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t0 = s + src_stride;
+  const uint8_t* t1 = s + src_stride * 2;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, src2, src3, src4, src5, out;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  v8u16 zero = {0};
+  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
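+  // Fixed-point reciprocals: 0x1C71 ~= 65536/9 scales the 3x3 (9-pixel) sums,
+  // 0x2AAA ~= 65536/6 scales the 2x3 (6-pixel) sums, before the >> 16 below.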
+  v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
+  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
+    vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
+    vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
+    vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    tmp0 = __msa_hadd_u_w(vec4, vec4);
+    tmp1 = __msa_hadd_u_w(vec5, vec5);
+    tmp2 = __msa_hadd_u_w(vec6, vec6);
+    tmp3 = __msa_hadd_u_w(vec7, vec7);
+    tmp4 = __msa_hadd_u_w(vec0, vec0);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    tmp0 = __msa_hadd_u_w(vec0, vec0);
+    tmp1 = __msa_hadd_u_w(vec1, vec1);
+    tmp0 *= const_0x1C71;
+    tmp1 *= const_0x1C71;
+    tmp4 *= const_0x2AAA;
+    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+    dst0 = __msa_copy_u_d((v2i64)out, 0);
+    dst1 = __msa_copy_u_w((v4i32)out, 2);
+    SD(dst0, dst_ptr);
+    SW(dst1, dst_ptr + 8);
+    s += 32;
+    t0 += 32;
+    t1 += 32;
+    dst_ptr += 12;
+  }
+}
+
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+  int x;
+  v16u8 src0;
+  v8u16 dst0, dst1;
+  v16i8 zero = {0};
+
+  assert(src_width > 0);
+
+  for (x = 0; x < src_width; x += 16) {
+    src0 = LD_UB(src_ptr);
+    dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
+    dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
+    dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    ST_UH2(dst0, dst1, dst_ptr, 8);
+    src_ptr += 16;
+    dst_ptr += 16;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
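
Note: ScaleRowDown2Box_MSA above is a vectorized 2x2 box filter; the hadd_u_h
sums followed by srari by 2 implement a rounded average. A minimal scalar
sketch of the per-pixel arithmetic, for reference only (the helper name
ScaleRowDown2Box_Ref is illustrative and not part of this patch):

// Scalar reference for the 2x2 box filter vectorized above.
#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown2Box_Ref(const uint8_t* src, ptrdiff_t stride,
                                 uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    // Rounded average of a 2x2 block: (a + b + c + d + 2) >> 2,
    // which the MSA code computes as hadd_u_h sums plus srari by 2.
    int sum = src[0] + src[1] + src[stride] + src[stride + 1];
    dst[x] = (uint8_t)((sum + 2) >> 2);
    src += 2;
  }
}
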
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
index 44b0c80..9b4dce3 100644
--- a/files/source/scale_neon.cc
+++ b/files/source/scale_neon.cc
@@ -23,8 +23,11 @@
 // Provided by Fritz Koenig
 
 // Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
+void ScaleRowDown2_NEON(const uint8* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8* dst,
+                        int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     // load even pixels into q0, odd into q1
@@ -43,8 +46,11 @@
 }
 
 // Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8* dst,
+                              int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -66,8 +72,10 @@
 }
 
 // Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
+void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8* dst,
+                           int dst_width) {
   asm volatile (
     // change the stride to row 2 pointer
     "add        %1, %0                         \n"
@@ -95,8 +103,11 @@
   );
 }
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4_NEON(const uint8* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -113,12 +124,14 @@
   );
 }
 
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8* dst_ptr,
+                           int dst_width) {
   const uint8* src_ptr1 = src_ptr + src_stride;
   const uint8* src_ptr2 = src_ptr + src_stride * 2;
   const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
+  asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
     "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
@@ -155,7 +168,9 @@
 // Point samples 32 pixels to 24 pixels.
 void ScaleRowDown34_NEON(const uint8* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
+                         uint8* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -175,7 +190,8 @@
 
 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+                               uint8* dst_ptr,
+                               int dst_width) {
   asm volatile (
     "vmov.u8    d24, #3                        \n"
     "add        %3, %0                         \n"
@@ -234,7 +250,8 @@
 
 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+                               uint8* dst_ptr,
+                               int dst_width) {
   asm volatile (
     "vmov.u8    d24, #3                        \n"
     "add        %3, %0                         \n"
@@ -274,21 +291,20 @@
 }
 
 #define HAS_SCALEROWDOWN38_NEON
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+static uvec8 kShuf38_2 = {0,  8, 16, 2,  10, 17, 4, 12,
+                          18, 6, 14, 19, 0,  0,  0, 0};
+static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+                             65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
+static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+                             65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
 
 // 32 -> 12
 void ScaleRowDown38_NEON(const uint8* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
+                         uint8* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
   asm volatile (
     MEMACCESS(3)
     "vld1.8     {q3}, [%3]                     \n"
@@ -314,7 +330,8 @@
 // 32x3 -> 12x1
 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
+                                      uint8* dst_ptr,
+                                      int dst_width) {
   const uint8* src_ptr1 = src_ptr + src_stride * 2;
 
   asm volatile (
@@ -433,7 +450,8 @@
 // 32x2 -> 12x1
 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+                               uint8* dst_ptr,
+                               int dst_width) {
   asm volatile (
     MEMACCESS(4)
     "vld1.16    {q13}, [%4]                    \n"
@@ -530,8 +548,11 @@
   );
 }
 
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRows_NEON(const uint8* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint16* dst_ptr,
+                       int src_width,
+                       int src_height) {
   const uint8* src_tmp;
   asm volatile (
   "1:                                          \n"
@@ -563,6 +584,7 @@
   );
 }
 
+// clang-format off
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
 #define LOAD2_DATA8_LANE(n)                                    \
@@ -571,13 +593,17 @@
     "add        %3, %3, %4                     \n"             \
     MEMACCESS(6)                                               \
     "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
+// clang-format on
 
-// The NEON version mimics this formula:
+// The NEON version mimics this formula (from row_common.cc):
 // #define BLENDER(a, b, f) (uint8)((int)(a) +
-//    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
+void ScaleFilterCols_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
   const uint8* src_tmp = src_ptr;
@@ -640,8 +666,10 @@
 
 // 16x2 -> 16x1
 void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
+                          const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction) {
   asm volatile (
     "cmp          %4, #0                       \n"
     "beq          100f                         \n"
@@ -737,8 +765,11 @@
   );
 }
 
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst,
+                            int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     // load even pixels into q0, odd into q1
@@ -760,8 +791,11 @@
   );
 }
 
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -788,8 +822,10 @@
   );
 }
 
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst,
+                               int dst_width) {
   asm volatile (
     // change the stride to row 2 pointer
     "add        %1, %1, %0                     \n"
@@ -829,8 +865,12 @@
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb,
+                               int dst_width) {
+  (void)src_stride;
   asm volatile (
     "mov        r12, %3, lsl #2                \n"
   "1:                                          \n"
@@ -856,9 +896,11 @@
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+                                  uint8* dst_argb,
+                                  int dst_width) {
   asm volatile (
     "mov        r12, %4, lsl #2                \n"
     "add        %1, %1, %0                     \n"
@@ -902,17 +944,22 @@
   );
 }
 
+// clang-format off
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD1_DATA32_LANE(dn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
+#define LOAD1_DATA32_LANE(dn, n)                            \
+  "lsr        %5, %3, #16                    \n"            \
+  "add        %6, %1, %5, lsl #2             \n"            \
+  "add        %3, %3, %4                     \n"            \
+  MEMACCESS(6)                                              \
+  "vld1.32    {" #dn "[" #n "]}, [%6]        \n"
+// clang-format on
 
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+void ScaleARGBCols_NEON(uint8* dst_argb,
+                        const uint8* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
   int tmp;
   const uint8* src_tmp = src_argb;
   asm volatile (
@@ -944,17 +991,22 @@
 
 #undef LOAD1_DATA32_LANE
 
+// clang-format off
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
+#define LOAD2_DATA32_LANE(dn1, dn2, n)                             \
+  "lsr        %5, %3, #16                           \n"            \
+  "add        %6, %1, %5, lsl #2                    \n"            \
+  "add        %3, %3, %4                            \n"            \
+  MEMACCESS(6)                                                     \
+  "vld2.32    {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6]    \n"
+// clang-format on
 
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_NEON(uint8* dst_argb,
+                              const uint8* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
   const uint8* src_tmp = src_argb;
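
Note: the BLENDER comment added above describes a 16.16 fixed-point linear
interpolation with rounding, which ScaleFilterCols_NEON reproduces with
vector multiplies and rshrn. A scalar sketch of that formula, assuming x and
dx are 16.16 fixed-point source positions (the name ScaleFilterCols_Ref is
illustrative and not part of this patch):

// Scalar form of the BLENDER formula quoted above.
#include <stdint.h>

static void ScaleFilterCols_Ref(uint8_t* dst, const uint8_t* src,
                                int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int a = src[x >> 16];        // left sample
    int b = src[(x >> 16) + 1];  // right sample
    int f = x & 0xffff;          // fraction, 0..65535
    // Rounded linear blend a + f * (b - a), in 16.16 fixed point.
    dst[j] = (uint8_t)(a + (((f * (b - a)) + 0x8000) >> 16));
    x += dx;
  }
}
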
diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc
index ff277f2..a98b9d0 100644
--- a/files/source/scale_neon64.cc
+++ b/files/source/scale_neon64.cc
@@ -21,8 +21,11 @@
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
+void ScaleRowDown2_NEON(const uint8* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8* dst,
+                        int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     // load even pixels into v0, odd into v1
@@ -41,8 +44,11 @@
 }
 
 // Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8* dst,
+                              int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -64,8 +70,10 @@
 }
 
 // Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
+void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8* dst,
+                           int dst_width) {
   asm volatile (
     // change the stride to row 2 pointer
     "add        %1, %1, %0                     \n"
@@ -93,8 +101,11 @@
   );
 }
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4_NEON(const uint8* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -111,12 +122,14 @@
   );
 }
 
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8* dst_ptr,
+                           int dst_width) {
   const uint8* src_ptr1 = src_ptr + src_stride;
   const uint8* src_ptr2 = src_ptr + src_stride * 2;
   const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
+  asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
     "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
@@ -152,15 +165,17 @@
 // Point samples 32 pixels to 24 pixels.
 void ScaleRowDown34_NEON(const uint8* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
+                         uint8* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                                  \n"
     MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
     "subs      %w2, %w2, #24                           \n"
     "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
     MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
     "b.gt      1b                                      \n"
   : "+r"(src_ptr),          // %0
     "+r"(dst_ptr),          // %1
@@ -172,15 +187,16 @@
 
 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+                               uint8* dst_ptr,
+                               int dst_width) {
   asm volatile (
     "movi      v20.8b, #3                              \n"
     "add       %3, %3, %0                              \n"
   "1:                                                  \n"
     MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
     MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
     "subs         %w2, %w2, #24                        \n"
 
     // filter src line 0 with src line 1
@@ -232,7 +248,8 @@
 
 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+                               uint8* dst_ptr,
+                               int dst_width) {
   asm volatile (
     "movi      v20.8b, #3                              \n"
     "add       %3, %3, %0                              \n"
@@ -273,29 +290,28 @@
   );
 }
 
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+static uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
+                          34, 6,  22, 35, 0,  0,  0, 0};
+static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+                             65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
+static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+                             65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
 
 // 32 -> 12
 void ScaleRowDown38_NEON(const uint8* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
+                         uint8* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
   asm volatile (
     MEMACCESS(3)
     "ld1       {v3.16b}, [%3]                          \n"
   "1:                                                  \n"
     MEMACCESS(0)
-    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"
+    "ld1       {v0.16b,v1.16b}, [%0], #32              \n"
     "subs      %w2, %w2, #12                           \n"
-    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"
+    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b         \n"
     MEMACCESS(1)
     "st1       {v2.8b}, [%1], #8                       \n"
     MEMACCESS(1)
@@ -312,7 +328,8 @@
 // 32x3 -> 12x1
 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
+                                      uint8* dst_ptr,
+                                      int dst_width) {
   const uint8* src_ptr1 = src_ptr + src_stride * 2;
   ptrdiff_t tmp_src_stride = src_stride;
 
@@ -441,7 +458,8 @@
 // 32x2 -> 12x1
 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+                               uint8* dst_ptr,
+                               int dst_width) {
   // TODO(fbarchard): use src_stride directly for clang 3.5+.
   ptrdiff_t tmp_src_stride = src_stride;
   asm volatile (
@@ -545,8 +563,11 @@
   );
 }
 
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
+void ScaleAddRows_NEON(const uint8* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint16* dst_ptr,
+                       int src_width,
+                       int src_height) {
   const uint8* src_tmp;
   asm volatile (
   "1:                                          \n"
@@ -578,23 +599,32 @@
   );
 }
 
+// clang-format off
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                                    \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5                    \n"              \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
+#define LOAD2_DATA8_LANE(n)                                 \
+  "lsr        %5, %3, #16                    \n"            \
+  "add        %6, %1, %5                     \n"            \
+  "add        %3, %3, %4                     \n"            \
+  MEMACCESS(6)                                              \
+  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
+// clang-format on
 
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8)((int)(a) +
+//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
   const uint8* src_tmp = src_ptr;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
+  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64)x;
+  int64 dx64 = (int64)dx;
   asm volatile (
     "dup        v0.4s, %w3                     \n"  // x
     "dup        v1.4s, %w4                     \n"  // dx
@@ -626,8 +656,8 @@
     "ushll2    v6.4s, v6.8h, #0                \n"
     "mul       v16.4s, v16.4s, v7.4s           \n"
     "mul       v17.4s, v17.4s, v6.4s           \n"
-    "rshrn      v6.4h, v16.4s, #16             \n"
-    "rshrn2     v6.8h, v17.4s, #16             \n"
+    "rshrn     v6.4h, v16.4s, #16              \n"
+    "rshrn2    v6.8h, v17.4s, #16              \n"
     "add       v4.8h, v4.8h, v6.8h             \n"
     "xtn       v4.8b, v4.8h                    \n"
 
@@ -654,9 +684,11 @@
 
 // 16x2 -> 16x1
 void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-    int y_fraction = 256 - source_y_fraction;
+                          const uint8* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction) {
+  int y_fraction = 256 - source_y_fraction;
   asm volatile (
     "cmp          %w4, #0                      \n"
     "b.eq         100f                         \n"
@@ -752,8 +784,11 @@
   );
 }
 
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst,
+                            int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     // load even pixels into q0, odd into q1
@@ -775,8 +810,11 @@
   );
 }
 
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     MEMACCESS (0)
@@ -802,8 +840,10 @@
   );
 }
 
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst,
+                               int dst_width) {
   asm volatile (
     // change the stride to row 2 pointer
     "add        %1, %1, %0                     \n"
@@ -839,8 +879,12 @@
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb,
+                               int dst_width) {
+  (void)src_stride;
   asm volatile (
   "1:                                          \n"
     MEMACCESS(0)
@@ -867,9 +911,11 @@
 // Alignment requirement: src_argb 4 byte aligned.
 // TODO(Yang Zhang): Might be worth another optimization pass in future.
 // It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+                                  uint8* dst_argb,
+                                  int dst_width) {
   asm volatile (
     "add        %1, %1, %0                     \n"
   "1:                                          \n"
@@ -916,21 +962,26 @@
   );
 }
 
+// clang-format off
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD1_DATA32_LANE(vn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld1        {"#vn".s}["#n"], [%6]          \n"
+#define LOAD1_DATA32_LANE(vn, n)                            \
+  "lsr        %5, %3, #16                    \n"            \
+  "add        %6, %1, %5, lsl #2             \n"            \
+  "add        %3, %3, %4                     \n"            \
+  MEMACCESS(6)                                              \
+ "ld1        {" #vn ".s}[" #n "], [%6]       \n"
+// clang-format on
 
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+void ScaleARGBCols_NEON(uint8* dst_argb,
+                        const uint8* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
   const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
+  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64)x;
+  int64 dx64 = (int64)dx;
   int64 tmp64;
   asm volatile (
   "1:                                          \n"
@@ -961,23 +1012,28 @@
 
 #undef LOAD1_DATA32_LANE
 
+// clang-format off
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
+#define LOAD2_DATA32_LANE(vn1, vn2, n)                             \
+  "lsr        %5, %3, #16                           \n"            \
+  "add        %6, %1, %5, lsl #2                    \n"            \
+  "add        %3, %3, %4                            \n"            \
+  MEMACCESS(6)                                                     \
+  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
+// clang-format on
 
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_NEON(uint8* dst_argb,
+                              const uint8* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
   const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
+  int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64)x;
+  int64 dx64 = (int64)dx;
   asm volatile (
     "dup        v0.4s, %w3                     \n"  // x
     "dup        v1.4s, %w4                     \n"  // dx
diff --git a/files/source/scale_win.cc b/files/source/scale_win.cc
index f170973..0c5b3a1 100644
--- a/files/source/scale_win.cc
+++ b/files/source/scale_win.cc
@@ -20,94 +20,89 @@
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
 
 // Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
+                       128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
+                       128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
+                       128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
 
 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
+                        10, 11, 12, 13, 13, 14, 14, 15};
 
 // Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
 
 // Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
 
 // Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
 
 // Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
+static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
 
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
+                         128, 128, 128, 128, 128, 128, 128, 128};
 
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
+                         6,   8,   11,  14,  128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
+                        128, 128, 128, 128, 128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
+                         6,   7,   12,  13,  128, 128, 128, 128};
 
 // Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+                            65536 / 9, 65536 / 6, 0,         0};
 
 // Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
+                         11, 128, 14, 128, 128, 128, 128, 128};
 
 // Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
+                         12, 128, 15, 128, 128, 128, 128, 128};
 
 // Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
+                         13, 128, 128, 128, 128, 128, 128, 128};
 
 // Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+                           65536 / 3, 65536 / 2, 0,         0};
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
-__declspec(naked)
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           uint8* dst_ptr,
+                                           int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm0, 8          // isolate odd pixels.
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -120,27 +115,28 @@
 }
 
 // Blends 32x1 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+                                                 ptrdiff_t src_stride,
+                                                 uint8* dst_ptr,
+                                                 int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
     psrlw      xmm4, 15
     packuswb   xmm4, xmm4
-    pxor       xmm5, xmm5            // constant 0
+    pxor       xmm5, xmm5  // constant 0
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
-    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm0, xmm5       // (x + 1) / 2
     pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -153,20 +149,21 @@
 }
 
 // Blends 32x2 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+                                              ptrdiff_t src_stride,
+                                              uint8* dst_ptr,
+                                              int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
 
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
     psrlw      xmm4, 15
     packuswb   xmm4, xmm4
-    pxor       xmm5, xmm5            // constant 0
+    pxor       xmm5, xmm5  // constant 0
 
   wloop:
     movdqu     xmm0, [eax]
@@ -174,15 +171,15 @@
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // vertical add
+    paddw      xmm0, xmm2  // vertical add
     paddw      xmm1, xmm3
     psrlw      xmm0, 1
     psrlw      xmm1, 1
-    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm0, xmm5  // (x + 1) / 2
     pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -197,23 +194,24 @@
 
 #ifdef HAS_SCALEROWDOWN2_AVX2
 // Reads 64 pixels, throws half away and writes 32 pixels.
-__declspec(naked)
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8* dst_ptr,
+                                          int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     vmovdqu     ymm0, [eax]
     vmovdqu     ymm1, [eax + 32]
     lea         eax,  [eax + 64]
-    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
+    vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
     vpsrlw      ymm1, ymm1, 8
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -225,30 +223,31 @@
 }
 
 // Blends 64x1 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+                                                ptrdiff_t src_stride,
+                                                uint8* dst_ptr,
+                                                int dst_width) {
   __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
+    mov         eax, [esp + 4]  // src_ptr
+    // src_stride
+    mov         edx, [esp + 12]  // dst_ptr
+    mov         ecx, [esp + 16]  // dst_width
 
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
     vpsrlw      ymm4, ymm4, 15
     vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
+    vpxor       ymm5, ymm5, ymm5  // constant 0
 
   wloop:
     vmovdqu     ymm0, [eax]
     vmovdqu     ymm1, [eax + 32]
     lea         eax,  [eax + 64]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -262,20 +261,21 @@
 // For rounding, average = (sum + 2) / 4
 // becomes average((sum >> 1), 0)
 // Blends 64x2 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+                                             ptrdiff_t src_stride,
+                                             uint8* dst_ptr,
+                                             int dst_width) {
   __asm {
     push        esi
-    mov         eax, [esp + 4 + 4]    // src_ptr
-    mov         esi, [esp + 4 + 8]    // src_stride
-    mov         edx, [esp + 4 + 12]   // dst_ptr
-    mov         ecx, [esp + 4 + 16]   // dst_width
+    mov         eax, [esp + 4 + 4]  // src_ptr
+    mov         esi, [esp + 4 + 8]  // src_stride
+    mov         edx, [esp + 4 + 12]  // dst_ptr
+    mov         ecx, [esp + 4 + 16]  // dst_width
 
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
     vpsrlw      ymm4, ymm4, 15
     vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
+    vpxor       ymm5, ymm5, ymm5  // constant 0
 
   wloop:
     vmovdqu     ymm0, [eax]
@@ -283,18 +283,18 @@
     vmovdqu     ymm2, [eax + esi]
     vmovdqu     ymm3, [eax + esi + 32]
     lea         eax,  [eax + 64]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // vertical add
+    vpaddw      ymm0, ymm0, ymm2  // vertical add
     vpaddw      ymm1, ymm1, ymm3
-    vpsrlw      ymm0, ymm0, 1         // (x + 2) / 4 = (x / 2 + 1) / 2
+    vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
     vpsrlw      ymm1, ymm1, 1
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
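
The rounding identity quoted above the Box kernels is easy to verify: writing the 4-pixel sum as 4k + r, both (sum + 2) >> 2 and ((sum >> 1) + 1) >> 1 evaluate to k when r is 0 or 1 and to k + 1 when r is 2 or 3, so halving with a shift and letting vpavgw supply the final +1 and halving gives exactly (sum + 2) / 4. A scalar sketch of the 2x2 box reduction in that form (illustrative only):

#include <stddef.h>
#include <stdint.h>

// Average each 2x2 block with (sum + 2) / 4 rounding, computed the way the
// SIMD kernels do it: ((sum >> 1) + 1) >> 1.
static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = src_ptr[2 * x] + src_ptr[2 * x + 1] +    // horizontal add
              src_ptr[src_stride + 2 * x] +
              src_ptr[src_stride + 2 * x + 1];         // vertical add
    dst_ptr[x] = (uint8_t)(((sum >> 1) + 1) >> 1);     // == (sum + 2) / 4
  }
}
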
@@ -308,15 +308,16 @@
 #endif  // HAS_SCALEROWDOWN2_AVX2
 
 // Point samples 32 pixels to 8 pixels.
-__declspec(naked)
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           uint8* dst_ptr,
+                                           int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
     psrld      xmm5, 24
     pslld      xmm5, 16
 
@@ -339,50 +340,51 @@
 }
 
 // Blends 32x4 rectangle to 8x1.
-__declspec(naked)
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+                                              ptrdiff_t src_stride,
+                                              uint8* dst_ptr,
+                                              int dst_width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_ptr
-    mov        esi, [esp + 8 + 8]    // src_stride
-    mov        edx, [esp + 8 + 12]   // dst_ptr
-    mov        ecx, [esp + 8 + 16]   // dst_width
+    mov        eax, [esp + 8 + 4]  // src_ptr
+    mov        esi, [esp + 8 + 8]  // src_stride
+    mov        edx, [esp + 8 + 12]  // dst_ptr
+    mov        ecx, [esp + 8 + 16]  // dst_width
     lea        edi, [esi + esi * 2]  // src_stride * 3
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
     psrlw      xmm4, 15
     movdqa     xmm5, xmm4
     packuswb   xmm4, xmm4
-    psllw      xmm5, 3               // constant 0x0008
+    psllw      xmm5, 3  // constant 0x0008
 
   wloop:
-    movdqu     xmm0, [eax]           // average rows
+    movdqu     xmm0, [eax]  // average rows
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
-    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // vertical add rows 0, 1
+    paddw      xmm0, xmm2  // vertical add rows 0, 1
     paddw      xmm1, xmm3
     movdqu     xmm2, [eax + esi * 2]
     movdqu     xmm3, [eax + esi * 2 + 16]
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // add row 2
+    paddw      xmm0, xmm2  // add row 2
     paddw      xmm1, xmm3
     movdqu     xmm2, [eax + edi]
     movdqu     xmm3, [eax + edi + 16]
     lea        eax, [eax + 32]
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // add row 3
+    paddw      xmm0, xmm2  // add row 3
     paddw      xmm1, xmm3
     phaddw     xmm0, xmm1
-    paddw      xmm0, xmm5      // + 8 for round
-    psrlw      xmm0, 4         // /16 for average of 4 * 4
+    paddw      xmm0, xmm5  // + 8 for round
+    psrlw      xmm0, 4  // /16 for average of 4 * 4
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
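
The 32x4-to-8x1 kernel accumulates a full 4x4 block into one 16-bit word (phaddw folds the last pairwise sum), then rounds with +8 and shifts right by 4 to divide by the 16 contributing pixels. In scalar form (illustrative only):

#include <stddef.h>
#include <stdint.h>

// 4x4 box filter: sum 16 source pixels per output pixel, round, divide by 16.
static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (j = 0; j < 4; ++j) {      // 4 rows
      for (i = 0; i < 4; ++i) {    // 4 columns
        sum += src_ptr[j * src_stride + 4 * x + i];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // +8 for round, /16 for 4*4
  }
}
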
@@ -397,15 +399,16 @@
 
 #ifdef HAS_SCALEROWDOWN4_AVX2
 // Point samples 64 pixels to 16 pixels.
-__declspec(naked)
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8* dst_ptr,
+                                          int dst_width) {
   __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride ignored
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
+    mov         eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov         edx, [esp + 12]  // dst_ptr
+    mov         ecx, [esp + 16]  // dst_width
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
     vpsrld      ymm5, ymm5, 24
     vpslld      ymm5, ymm5, 16
 
@@ -416,10 +419,10 @@
     vpand       ymm0, ymm0, ymm5
     vpand       ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vpsrlw      ymm0, ymm0, 8
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -431,52 +434,53 @@
 }
 
 // Blends 64x4 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+                                             ptrdiff_t src_stride,
+                                             uint8* dst_ptr,
+                                             int dst_width) {
   __asm {
     push        esi
     push        edi
-    mov         eax, [esp + 8 + 4]    // src_ptr
-    mov         esi, [esp + 8 + 8]    // src_stride
-    mov         edx, [esp + 8 + 12]   // dst_ptr
-    mov         ecx, [esp + 8 + 16]   // dst_width
+    mov         eax, [esp + 8 + 4]  // src_ptr
+    mov         esi, [esp + 8 + 8]  // src_stride
+    mov         edx, [esp + 8 + 12]  // dst_ptr
+    mov         ecx, [esp + 8 + 16]  // dst_width
     lea         edi, [esi + esi * 2]  // src_stride * 3
-    vpcmpeqb    ymm4, ymm4, ymm4            // constant 0x0101
+    vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
     vpsrlw      ymm4, ymm4, 15
-    vpsllw      ymm5, ymm4, 3               // constant 0x0008
+    vpsllw      ymm5, ymm4, 3  // constant 0x0008
     vpackuswb   ymm4, ymm4, ymm4
 
   wloop:
-    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm0, [eax]  // average rows
     vmovdqu     ymm1, [eax + 32]
     vmovdqu     ymm2, [eax + esi]
     vmovdqu     ymm3, [eax + esi + 32]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // vertical add rows 0, 1
+    vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
     vpaddw      ymm1, ymm1, ymm3
     vmovdqu     ymm2, [eax + esi * 2]
     vmovdqu     ymm3, [eax + esi * 2 + 32]
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // add row 2
+    vpaddw      ymm0, ymm0, ymm2  // add row 2
     vpaddw      ymm1, ymm1, ymm3
     vmovdqu     ymm2, [eax + edi]
     vmovdqu     ymm3, [eax + edi + 32]
     lea         eax,  [eax + 64]
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // add row 3
+    vpaddw      ymm0, ymm0, ymm2  // add row 3
     vpaddw      ymm1, ymm1, ymm3
-    vphaddw     ymm0, ymm0, ymm1      // mutates
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vphaddw
-    vpaddw      ymm0, ymm0, ymm5      // + 8 for round
-    vpsrlw      ymm0, ymm0, 4         // /32 for average of 4 * 4
+    vphaddw     ymm0, ymm0, ymm1  // mutates
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
+    vpaddw      ymm0, ymm0, ymm5  // + 8 for round
+    vpsrlw      ymm0, ymm0, 4  // /16 for average of 4 * 4
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -494,14 +498,15 @@
 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
 // Then shuffled to do the scaling.
 
-__declspec(naked)
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            uint8* dst_ptr,
+                                            int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]   // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
     movdqa     xmm3, xmmword ptr kShuf0
     movdqa     xmm4, xmmword ptr kShuf1
     movdqa     xmm5, xmmword ptr kShuf2
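
The 3/4 scalers consume four source pixels per three destination pixels; kShuf0/kShuf1/kShuf2 encode which bytes of each load survive. A scalar sketch of the point-sampled variant (the taps 0, 1, 3 below are illustrative; the shuffle tables define the kernel's actual selection, and dst_width is assumed to be a multiple of 3):

#include <stdint.h>

// 4 -> 3 point sampling per step.
static void ScaleRowDown34_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];
    dst_ptr += 3;
    src_ptr += 4;
  }
}
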
@@ -541,16 +546,16 @@
 // xmm7 kRound34
 
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShuf01
     movdqa     xmm3, xmmword ptr kShuf11
     movdqa     xmm4, xmmword ptr kShuf21
@@ -559,7 +564,7 @@
     movdqa     xmm7, xmmword ptr kRound34
 
   wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm0, [eax]  // pixels 0..7
     movdqu     xmm1, [eax + esi]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm2
@@ -568,7 +573,7 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm0, [eax + 8]  // pixels 8..15
     movdqu     xmm1, [eax + esi + 8]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm3
@@ -577,7 +582,7 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm0, [eax + 16]  // pixels 16..23
     movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm0, xmm1
@@ -598,16 +603,16 @@
 }
 
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShuf01
     movdqa     xmm3, xmmword ptr kShuf11
     movdqa     xmm4, xmmword ptr kShuf21
@@ -616,7 +621,7 @@
     movdqa     xmm7, xmmword ptr kRound34
 
   wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm0, [eax]  // pixels 0..7
     movdqu     xmm1, [eax + esi]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
@@ -626,7 +631,7 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm0, [eax + 8]  // pixels 8..15
     movdqu     xmm1, [eax + esi + 8]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
@@ -636,7 +641,7 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm0, [eax + 16]  // pixels 16..23
     movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm1, xmm0
@@ -660,26 +665,27 @@
 // 3/8 point sampler
 
 // Scale 32 pixels to 12
-__declspec(naked)
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            uint8* dst_ptr,
+                                            int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
     movdqa     xmm4, xmmword ptr kShuf38a
     movdqa     xmm5, xmmword ptr kShuf38b
 
   xloop:
-    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
-    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
+    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
     lea        eax, [eax + 32]
     pshufb     xmm0, xmm4
     pshufb     xmm1, xmm5
     paddusb    xmm0, xmm1
 
-    movq       qword ptr [edx], xmm0  // write 12 pixels
+    movq       qword ptr [edx], xmm0       // write 12 pixels
     movhlps    xmm1, xmm0
     movd       [edx + 8], xmm1
     lea        edx, [edx + 12]
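
The 3/8 point sampler reads 32 source pixels and writes 12, i.e. three destination pixels for every eight source pixels; kShuf38a/kShuf38b pick the surviving bytes and paddusb merges the two shuffled halves. A scalar sketch of that access pattern (the taps 0, 3, 6 are illustrative; dst_width assumed to be a multiple of 3):

#include <stdint.h>

// 8 -> 3 point sampling per step (32 -> 12 per vector iteration above).
static void ScaleRowDown38_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[3];
    dst_ptr[2] = src_ptr[6];
    dst_ptr += 3;
    src_ptr += 8;
  }
}
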
@@ -691,23 +697,23 @@
 }
 
 // Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShufAc
     movdqa     xmm3, xmmword ptr kShufAc3
     movdqa     xmm4, xmmword ptr kScaleAc33
     pxor       xmm5, xmm5
 
   xloop:
-    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
     movdqu     xmm6, [eax + esi]
     movhlps    xmm1, xmm0
     movhlps    xmm7, xmm6
@@ -725,14 +731,14 @@
     paddusw    xmm0, xmm6
     paddusw    xmm1, xmm7
 
-    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
+    movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
     psrldq     xmm0, 2
     paddusw    xmm6, xmm0
     psrldq     xmm0, 2
     paddusw    xmm6, xmm0
     pshufb     xmm6, xmm2
 
-    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
+    movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
     psrldq     xmm1, 2
     paddusw    xmm7, xmm1
     psrldq     xmm1, 2
@@ -740,10 +746,10 @@
     pshufb     xmm7, xmm3
     paddusw    xmm6, xmm7
 
-    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
+    pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
     packuswb   xmm6, xmm6
 
-    movd       [edx], xmm6           // write 6 pixels
+    movd       [edx], xmm6  // write 6 pixels
     psrlq      xmm6, 16
     movd       [edx + 2], xmm6
     lea        edx, [edx + 6]
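
The "divide by 9,9,6" comment works because pmulhuw returns the high 16 bits of an unsigned 16x16 multiply: multiplying a box sum by a constant near 65536/9 (or 65536/6 where the kernel divides by 6) and keeping only the high word approximates integer division without an actual divide. A scalar illustration of the trick (the reciprocal here is the obvious 65536/9; kScaleAc33 holds the kernel's exact constants):

#include <stdint.h>

// Approximate sum / 9 the way pmulhuw does: multiply by a Q16 reciprocal
// and keep only the high 16 bits of the product.
static uint16_t DivideBy9_Pmulhuw(uint16_t sum) {
  const uint32_t kRecip9 = 65536 / 9;  // ~1/9 in 0.16 fixed point
  return (uint16_t)((sum * kRecip9) >> 16);
}
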
@@ -756,28 +762,28 @@
 }
 
 // Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShufAb0
     movdqa     xmm3, xmmword ptr kShufAb1
     movdqa     xmm4, xmmword ptr kShufAb2
     movdqa     xmm5, xmmword ptr kScaleAb2
 
   xloop:
-    movdqu     xmm0, [eax]           // average 2 rows into xmm0
+    movdqu     xmm0, [eax]  // average 2 rows into xmm0
     movdqu     xmm1, [eax + esi]
     lea        eax, [eax + 16]
     pavgb      xmm0, xmm1
 
-    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
+    movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
     pshufb     xmm1, xmm2
     movdqa     xmm6, xmm0
     pshufb     xmm6, xmm3
@@ -785,10 +791,10 @@
     pshufb     xmm0, xmm4
     paddusw    xmm1, xmm0
 
-    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
+    pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
     packuswb   xmm1, xmm1
 
-    movd       [edx], xmm1           // write 6 pixels
+    movd       [edx], xmm1  // write 6 pixels
     psrlq      xmm1, 16
     movd       [edx + 2], xmm1
     lea        edx, [edx + 6]
@@ -801,26 +807,27 @@
 }
 
 // Reads 16 bytes and accumulates to 16 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
+                                        uint16* dst_ptr,
+                                        int src_width) {
   __asm {
-    mov        eax, [esp + 4]   // src_ptr
-    mov        edx, [esp + 8]   // dst_ptr
+    mov        eax, [esp + 4]  // src_ptr
+    mov        edx, [esp + 8]  // dst_ptr
     mov        ecx, [esp + 12]  // src_width
     pxor       xmm5, xmm5
 
-  // sum rows
+    // sum rows
   xloop:
-    movdqu     xmm3, [eax]       // read 16 bytes
+    movdqu     xmm3, [eax]  // read 16 bytes
     lea        eax, [eax + 16]
-    movdqu     xmm0, [edx]       // read 16 words from destination
+    movdqu     xmm0, [edx]  // read 16 words from destination
     movdqu     xmm1, [edx + 16]
     movdqa     xmm2, xmm3
     punpcklbw  xmm2, xmm5
     punpckhbw  xmm3, xmm5
-    paddusw    xmm0, xmm2        // sum 16 words
+    paddusw    xmm0, xmm2  // sum 16 words
     paddusw    xmm1, xmm3
-    movdqu     [edx], xmm0       // write 16 words to destination
+    movdqu     [edx], xmm0  // write 16 words to destination
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 16
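
ScaleAddRow widens each source byte and accumulates it into a row of 16-bit sums, using saturating adds (paddusw) so overflow clamps instead of wrapping. A scalar sketch (illustrative only):

#include <stdint.h>

// Accumulate a row of bytes into a row of 16-bit sums, saturating like paddusw.
static void ScaleAddRow_Sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);
  }
}
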
@@ -831,24 +838,25 @@
 
 #ifdef HAS_SCALEADDROW_AVX2
 // Reads 32 bytes and accumulates to 32 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
+                                        uint16* dst_ptr,
+                                        int src_width) {
   __asm {
-    mov         eax, [esp + 4]   // src_ptr
-    mov         edx, [esp + 8]   // dst_ptr
+    mov         eax, [esp + 4]  // src_ptr
+    mov         edx, [esp + 8]  // dst_ptr
     mov         ecx, [esp + 12]  // src_width
     vpxor       ymm5, ymm5, ymm5
 
-  // sum rows
+    // sum rows
   xloop:
-    vmovdqu     ymm3, [eax]       // read 32 bytes
+    vmovdqu     ymm3, [eax]  // read 32 bytes
     lea         eax, [eax + 32]
     vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
     vpunpcklbw  ymm2, ymm3, ymm5
     vpunpckhbw  ymm3, ymm3, ymm5
-    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
+    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
     vpaddusw    ymm1, ymm3, [edx + 32]
-    vmovdqu     [edx], ymm0       // write 32 words to destination
+    vmovdqu     [edx], ymm0  // write 32 words to destination
     vmovdqu     [edx + 32], ymm1
     lea         edx, [edx + 64]
     sub         ecx, 32
@@ -862,86 +870,87 @@
 
 // Constant for making pixels signed to avoid pmaddubsw
 // saturation.
-static uvec8 kFsub80 =
-  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
 
 // Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
-  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+                         0x4040, 0x4040, 0x4040, 0x4040};
 
 // Bilinear column filtering. SSSE3 version.
-__declspec(naked)
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
+__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
+                                             const uint8* src_ptr,
+                                             int dst_width,
+                                             int x,
+                                             int dx) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        edi, [esp + 12 + 4]    // dst_ptr
-    mov        esi, [esp + 12 + 8]    // src_ptr
-    mov        ecx, [esp + 12 + 12]   // dst_width
+    mov        edi, [esp + 12 + 4]  // dst_ptr
+    mov        esi, [esp + 12 + 8]  // src_ptr
+    mov        ecx, [esp + 12 + 12]  // dst_width
     movd       xmm2, [esp + 12 + 16]  // x
     movd       xmm3, [esp + 12 + 20]  // dx
-    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
+    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
     movd       xmm5, eax
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
     psrlw      xmm6, 9
-    pcmpeqb    xmm7, xmm7           // generate 0x0001
+    pcmpeqb    xmm7, xmm7  // generate 0x0001
     psrlw      xmm7, 15
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    pextrw     eax, xmm2, 1  // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29
 
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    movdqa     xmm0, xmm2  // x1 = x0 + dx
     paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+    punpckldq  xmm2, xmm0  // x0 x1
+    punpckldq  xmm3, xmm3  // dx dx
+    paddd      xmm3, xmm3  // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3  // get x1 integer. preroll
 
     // 2 Pixel loop.
   xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
+    movdqa     xmm1, xmm2  // x0, x1 fractions.
+    paddd      xmm2, xmm3  // x += dx
     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
     movd       xmm0, ebx
-    psrlw      xmm1, 9              // 7 bit fractions.
+    psrlw      xmm1, 9  // 7 bit fractions.
     movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
     movd       xmm4, ebx
-    pshufb     xmm1, xmm5           // 0011
+    pshufb     xmm1, xmm5  // 0011
     punpcklwd  xmm0, xmm4
     psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    paddusb    xmm1, xmm7           // +1 so 0..7f and 80..1
-    pmaddubsw  xmm1, xmm0           // 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    pxor       xmm1, xmm6  // 0..7f and 7f..0
+    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1
+    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
     paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
-    psrlw      xmm1, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm1, xmm1           // 8 bits, 2 pixels.
+    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
+    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
     movd       ebx, xmm1
     mov        [edi], bx
     lea        edi, [edi + 2]
-    sub        ecx, 2               // 2 pixels
+    sub        ecx, 2  // 2 pixels
     jge        xloop2
 
  xloop29:
     add        ecx, 2 - 1
     jl         xloop99
 
-    // 1 pixel remainder
+        // 1 pixel remainder
     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
     movd       xmm0, ebx
-    psrlw      xmm2, 9              // 7 bit fractions.
-    pshufb     xmm2, xmm5           // 0011
+    psrlw      xmm2, 9  // 7 bit fractions.
+    pshufb     xmm2, xmm5  // 0011
     psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    paddusb    xmm2, xmm7           // +1 so 0..7f and 80..1
-    pmaddubsw  xmm2, xmm0           // 16 bit
+    pxor       xmm2, xmm6  // 0..7f and 7f..0
+    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
+    pmaddubsw  xmm2, xmm0  // 16 bit
     paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
-    psrlw      xmm2, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm2, xmm2           // 8 bits
+    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
+    packuswb   xmm2, xmm2  // 8 bits
     movd       ebx, xmm2
     mov        [edi], bl
 
@@ -955,13 +964,15 @@
 }
 
 // Reads 16 pixels, duplicates them and writes 32 pixels.
-__declspec(naked)
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
+__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
+                                         const uint8* src_ptr,
+                                         int dst_width,
+                                         int x,
+                                         int dx) {
   __asm {
-    mov        edx, [esp + 4]    // dst_ptr
-    mov        eax, [esp + 8]    // src_ptr
-    mov        ecx, [esp + 12]   // dst_width
+    mov        edx, [esp + 4]  // dst_ptr
+    mov        eax, [esp + 8]  // src_ptr
+    mov        ecx, [esp + 12]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -980,15 +991,15 @@
 }
 
 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-__declspec(naked)
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                                              ptrdiff_t src_stride,
+                                              uint8* dst_argb,
+                                              int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]   // src_argb
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_argb
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1005,23 +1016,23 @@
 }
 
 // Blends 8x1 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                                    ptrdiff_t src_stride,
+                                                    uint8* dst_argb,
+                                                    int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_argb
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_argb
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd       // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1033,16 +1044,16 @@
 }
 
 // Blends 8x2 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                                                 ptrdiff_t src_stride,
+                                                 uint8* dst_argb,
+                                                 int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_argb
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_argb
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1050,11 +1061,11 @@
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm0, xmm2  // average rows
     pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd  // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
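
The ARGB 2x2 box kernel averages the two rows with pavgb, separates even and odd pixels with shufps, and averages those, so each output channel is computed as nested rounded averages rather than one (sum + 2) / 4. A per-channel scalar sketch of that order of operations (illustrative only):

#include <stddef.h>
#include <stdint.h>

#define AVG_U8(a, b) ((uint8_t)(((a) + (b) + 1) >> 1)) /* what pavgb computes */

// 2x2 ARGB box reduction: average rows first, then the two adjacent pixels.
static void ScaleARGBRowDown2Box_Sketch(const uint8_t* src_argb,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A
      uint8_t p0 = AVG_U8(src_argb[8 * x + c],
                          src_argb[src_stride + 8 * x + c]);
      uint8_t p1 = AVG_U8(src_argb[8 * x + 4 + c],
                          src_argb[src_stride + 8 * x + 4 + c]);
      dst_argb[4 * x + c] = AVG_U8(p0, p1);
    }
  }
}
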
@@ -1067,18 +1078,19 @@
 }
 
 // Reads 4 pixels at a time.
-__declspec(naked)
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+                                                 ptrdiff_t src_stride,
+                                                 int src_stepx,
+                                                 uint8* dst_argb,
+                                                 int dst_width) {
   __asm {
     push       ebx
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_argb
-                                     // src_stride ignored
-    mov        ebx, [esp + 8 + 12]   // src_stepx
-    mov        edx, [esp + 8 + 16]   // dst_argb
-    mov        ecx, [esp + 8 + 20]   // dst_width
+    mov        eax, [esp + 8 + 4]   // src_argb
+    // src_stride ignored
+    mov        ebx, [esp + 8 + 12]  // src_stepx
+    mov        edx, [esp + 8 + 16]  // dst_argb
+    mov        ecx, [esp + 8 + 20]  // dst_width
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
@@ -1103,21 +1115,21 @@
 }
 
 // Blends four 2x2 to 4x1.
-__declspec(naked)
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                                    ptrdiff_t src_stride,
+                                                    int src_stepx,
+                                                    uint8* dst_argb,
+                                                    int dst_width) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]    // src_argb
-    mov        esi, [esp + 12 + 8]    // src_stride
-    mov        ebx, [esp + 12 + 12]   // src_stepx
-    mov        edx, [esp + 12 + 16]   // dst_argb
-    mov        ecx, [esp + 12 + 20]   // dst_width
-    lea        esi, [eax + esi]       // row1 pointer
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        esi, [esp + 12 + 8]  // src_stride
+    mov        ebx, [esp + 12 + 12]  // src_stepx
+    mov        edx, [esp + 12 + 16]  // dst_argb
+    mov        ecx, [esp + 12 + 20]  // dst_width
+    lea        esi, [eax + esi]  // row1 pointer
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
@@ -1132,11 +1144,11 @@
     movq       xmm3, qword ptr [esi + ebx * 2]
     movhps     xmm3, qword ptr [esi + edi]
     lea        esi,  [esi + ebx * 4]
-    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm0, xmm2  // average rows
     pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd  // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1151,29 +1163,31 @@
 }
 
 // Column scaling unfiltered. SSE2 version.
-__declspec(naked)
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
+                                          const uint8* src_argb,
+                                          int dst_width,
+                                          int x,
+                                          int dx) {
   __asm {
     push       edi
     push       esi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
+    mov        edi, [esp + 8 + 4]  // dst_argb
+    mov        esi, [esp + 8 + 8]  // src_argb
+    mov        ecx, [esp + 8 + 12]  // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
 
-    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
-    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
+    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
+    pshufd     xmm0, xmm3, 0x11  // dx  0 dx  0
     paddd      xmm2, xmm0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
-    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
-    paddd      xmm2, xmm0            // x3 x2 x1 x0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
-    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
+    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 2
+    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
+    paddd      xmm2, xmm0  // x3 x2 x1 x0
+    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 4
+    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4
 
-    pextrw     eax, xmm2, 1          // get x0 integer.
-    pextrw     edx, xmm2, 3          // get x1 integer.
+    pextrw     eax, xmm2, 1  // get x0 integer.
+    pextrw     edx, xmm2, 3  // get x1 integer.
 
     cmp        ecx, 0
     jle        xloop99
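
The unfiltered ARGB column scaler simply point-samples: each destination pixel copies the source pixel at x >> 16 and x advances by dx (both 16.16 fixed point), with the asm unrolling this by four and pre-fetching the next indices via pextrw. A scalar sketch (illustrative only):

#include <stdint.h>
#include <string.h>

// Point-sample ARGB columns; x and dx are 16.16 fixed point.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                 int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    memcpy(dst_argb + 4 * j, src_argb + 4 * (x >> 16), 4);  // one ARGB pixel
    x += dx;
  }
}
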
@@ -1184,20 +1198,20 @@
  xloop4:
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    pextrw     edx, xmm2, 7           // get x3 integer.
-    paddd      xmm2, xmm3             // x += dx
-    punpckldq  xmm0, xmm1             // x0 x1
+    pextrw     eax, xmm2, 5  // get x2 integer.
+    pextrw     edx, xmm2, 7  // get x3 integer.
+    paddd      xmm2, xmm3  // x += dx
+    punpckldq  xmm0, xmm1  // x0 x1
 
     movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
     movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
-    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
-    punpckldq  xmm1, xmm4             // x2 x3
-    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
+    punpckldq  xmm1, xmm4  // x2 x3
+    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
     movdqu     [edi], xmm0
     lea        edi, [edi + 16]
-    sub        ecx, 4                 // 4 pixels
+    sub        ecx, 4  // 4 pixels
     jge        xloop4
 
  xloop49:
@@ -1207,8 +1221,8 @@
     // 2 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    punpckldq  xmm0, xmm1             // x0 x1
+    pextrw     eax, xmm2, 5  // get x2 integer.
+    punpckldq  xmm0, xmm1  // x0 x1
 
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
@@ -1233,59 +1247,61 @@
 
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
 static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
+    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
 static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
 
-__declspec(naked)
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
+                                                 const uint8* src_argb,
+                                                 int dst_width,
+                                                 int x,
+                                                 int dx) {
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
+    mov        edi, [esp + 8 + 4]  // dst_argb
+    mov        esi, [esp + 8 + 8]  // src_argb
+    mov        ecx, [esp + 8 + 12]  // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
     movdqa     xmm4, xmmword ptr kShuffleColARGB
     movdqa     xmm5, xmmword ptr kShuffleFractions
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
     psrlw      xmm6, 9
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    pextrw     eax, xmm2, 1  // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29
 
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    movdqa     xmm0, xmm2  // x1 = x0 + dx
     paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+    punpckldq  xmm2, xmm0  // x0 x1
+    punpckldq  xmm3, xmm3  // dx dx
+    paddd      xmm3, xmm3  // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3  // get x1 integer. preroll
 
     // 2 Pixel loop.
   xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
+    movdqa     xmm1, xmm2  // x0, x1 fractions.
+    paddd      xmm2, xmm3  // x += dx
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    psrlw      xmm1, 9              // 7 bit fractions.
+    psrlw      xmm1, 9  // 7 bit fractions.
     movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
-    pshufb     xmm1, xmm5           // 0000000011111111
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
-    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    pshufb     xmm1, xmm5  // 0000000011111111
+    pshufb     xmm0, xmm4  // arrange pixels into pairs
+    pxor       xmm1, xmm6  // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
+    psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
-    sub        ecx, 2               // 2 pixels
+    sub        ecx, 2  // 2 pixels
     jge        xloop2
 
  xloop29:
@@ -1293,15 +1309,15 @@
     add        ecx, 2 - 1
     jl         xloop99
 
-    // 1 pixel remainder
-    psrlw      xmm2, 9              // 7 bit fractions.
+        // 1 pixel remainder
+    psrlw      xmm2, 9  // 7 bit fractions.
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    pshufb     xmm2, xmm5           // 00000000
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
+    pshufb     xmm2, xmm5  // 00000000
+    pshufb     xmm0, xmm4  // arrange pixels into pairs
+    pxor       xmm2, xmm6  // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
+    packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
     movd       [edi], xmm0
 
  xloop99:
@@ -1313,13 +1329,15 @@
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked)
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
+                                             const uint8* src_argb,
+                                             int dst_width,
+                                             int x,
+                                             int dx) {
   __asm {
-    mov        edx, [esp + 4]    // dst_argb
-    mov        eax, [esp + 8]    // src_argb
-    mov        ecx, [esp + 12]   // dst_width
+    mov        edx, [esp + 4]  // dst_argb
+    mov        eax, [esp + 8]  // src_argb
+    mov        ecx, [esp + 12]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1338,12 +1356,11 @@
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv_X86(int num, int div) {
+__declspec(naked) int FixedDiv_X86(int num, int div) {
   __asm {
-    mov        eax, [esp + 4]    // num
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
+    mov        eax, [esp + 4]  // num
+    cdq  // extend num to 64 bits
+    shld       edx, eax, 16  // 32.16
     shl        eax, 16
     idiv       dword ptr [esp + 8]
     ret
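
FixedDiv_X86 widens num to 64 bits with cdq, shifts the 64-bit pair left by 16 via shld/shl, and lets idiv produce the 16.16 quotient; FixedDiv1_X86 below additionally biases the numerator (the sub eax, 0x00010001) before dividing. The same 16.16 division in portable C (a sketch, not libyuv's non-x86 path):

#include <stdint.h>

// Divide num by div and return the result in 16.16 fixed point.
static int FixedDiv_Sketch(int num, int div) {
  return (int)((((int64_t)num) << 16) / div);
}
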
@@ -1351,13 +1368,12 @@
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv1_X86(int num, int div) {
+__declspec(naked) int FixedDiv1_X86(int num, int div) {
   __asm {
-    mov        eax, [esp + 4]    // num
-    mov        ecx, [esp + 8]    // denom
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
+    mov        eax, [esp + 4]  // num
+    mov        ecx, [esp + 8]  // denom
+    cdq  // extend num to 64 bits
+    shld       edx, eax, 16  // 32.16
     shl        eax, 16
     sub        eax, 0x00010001
     sbb        edx, 0
diff --git a/files/source/video_common.cc b/files/source/video_common.cc
index 00fb71e..3e9c6a2 100644
--- a/files/source/video_common.cc
+++ b/files/source/video_common.cc
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "libyuv/video_common.h"
 
 #ifdef __cplusplus
@@ -24,24 +23,24 @@
 };
 
 static const struct FourCCAliasEntry kFourCCAliases[] = {
-  {FOURCC_IYUV, FOURCC_I420},
-  {FOURCC_YU12, FOURCC_I420},
-  {FOURCC_YU16, FOURCC_I422},
-  {FOURCC_YU24, FOURCC_I444},
-  {FOURCC_YUYV, FOURCC_YUY2},
-  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
-  {FOURCC_HDYC, FOURCC_UYVY},
-  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
-  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
-  {FOURCC_DMB1, FOURCC_MJPG},
-  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
-  {FOURCC_RGB3, FOURCC_RAW },
-  {FOURCC_BGR3, FOURCC_24BG},
-  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
-  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
-  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
-  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
-  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
+    {FOURCC_IYUV, FOURCC_I420},
+    {FOURCC_YU12, FOURCC_I420},
+    {FOURCC_YU16, FOURCC_I422},
+    {FOURCC_YU24, FOURCC_I444},
+    {FOURCC_YUYV, FOURCC_YUY2},
+    {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
+    {FOURCC_HDYC, FOURCC_UYVY},
+    {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
+    {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
+    {FOURCC_DMB1, FOURCC_MJPG},
+    {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
+    {FOURCC_RGB3, FOURCC_RAW},
+    {FOURCC_BGR3, FOURCC_24BG},
+    {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
+    {FOURCC_CM24, FOURCC_RAW},   // kCMPixelFormat_24RGB
+    {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
+    {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
+    {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
 };
 // TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
 //  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA
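
This table maps vendor and legacy FourCC spellings onto the canonical codes the converters understand; the lookup that consumes it is a linear scan that passes unknown codes through unchanged, roughly as below (a sketch; the struct and function names here are illustrative, not the file's actual declarations):

#include <stdint.h>

struct FourCCAlias {
  uint32_t alias;
  uint32_t canonical;
};

// Return the canonical FourCC for an alias, or the input if it is not listed.
static uint32_t CanonicalFourCC_Sketch(const struct FourCCAlias* table,
                                       int count, uint32_t fourcc) {
  int i;
  for (i = 0; i < count; ++i) {
    if (table[i].alias == fourcc) {
      return table[i].canonical;
    }
  }
  return fourcc;
}
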
@@ -62,4 +61,3 @@
 }  // extern "C"
 }  // namespace libyuv
 #endif
-