SplitUV_Any variations for all CPUs.
BUG=126
TEST=convert tests NV12 with alignments
Review URL: https://webrtc-codereview.appspot.com/896007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@426 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_mips.cc b/source/row_mips.cc
index 3e1a1fa..7e4125d 100644
--- a/source/row_mips.cc
+++ b/source/row_mips.cc
@@ -23,21 +23,77 @@
   __asm__ __volatile__ (
     ".set push                                     \n"
     ".set noreorder                                \n"
-
     "srl             $t4, %[width], 4              \n"  // multiplies of 16
     "blez            $t4, 2f                       \n"
     " andi           %[width], %[width], 0xf       \n"  // residual
-    "andi            $t0, %[src_uv], 0x3           \n"
-    "andi            $t1, %[dst_u], 0x3            \n"
-    "andi            $t2, %[dst_v], 0x3            \n"
-    "or              $t0, $t0, $t1                 \n"
-    "or              $t0, $t0, $t2                 \n"
 
-    "beqz            $t0, 12f                      \n"  // test if aligned
+  "1:                                              \n"
+    "addiu           $t4, $t4, -1                  \n"
+    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
+    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
+    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
+    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
+    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
+    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
+    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
+    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
+    "addiu           %[src_uv], %[src_uv], 32      \n"
+    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
+    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
+    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
+    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
+    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
+    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
+    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
+    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
+    "sw              $t9, 0(%[dst_v])              \n"
+    "sw              $t0, 0(%[dst_u])              \n"
+    "sw              $t1, 4(%[dst_v])              \n"
+    "sw              $t2, 4(%[dst_u])              \n"
+    "sw              $t3, 8(%[dst_v])              \n"
+    "sw              $t5, 8(%[dst_u])              \n"
+    "sw              $t6, 12(%[dst_v])             \n"
+    "sw              $t7, 12(%[dst_u])             \n"
+    "addiu           %[dst_v], %[dst_v], 16        \n"
+    "bgtz            $t4, 1b                       \n"
+    " addiu          %[dst_u], %[dst_u], 16        \n"
+
+    "beqz            %[width], 3f                  \n"
     " nop                                          \n"
 
-    // src and dst are unaligned
-    "1:                                            \n"
+  "2:                                              \n"
+    "lbu             $t0, 0(%[src_uv])             \n"
+    "lbu             $t1, 1(%[src_uv])             \n"
+    "addiu           %[src_uv], %[src_uv], 2       \n"
+    "addiu           %[width], %[width], -1        \n"
+    "sb              $t0, 0(%[dst_u])              \n"
+    "sb              $t1, 0(%[dst_v])              \n"
+    "addiu           %[dst_u], %[dst_u], 1         \n"
+    "bgtz            %[width], 2b                  \n"
+    " addiu          %[dst_v], %[dst_v], 1         \n"
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+     : [src_uv] "+r" (src_uv),
+       [width] "+r" (width),
+       [dst_u] "+r" (dst_u),
+       [dst_v] "+r" (dst_v)
+     :
+     : "t0", "t1", "t2", "t3",
+       "t4", "t5", "t6", "t7", "t8", "t9"
+  );
+}
+
+void SplitUV_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
+                                  uint8* dst_v, int width) {
+  __asm__ __volatile__ (
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+    "srl             $t4, %[width], 4              \n"  // multiples of 16
+    "blez            $t4, 2f                       \n"
+    " andi           %[width], %[width], 0xf       \n"  // residual
+
+  "1:                                              \n"
     "addiu           $t4, $t4, -1                  \n"
     "lwr             $t0, 0(%[src_uv])             \n"
     "lwl             $t0, 3(%[src_uv])             \n"  // V1 | U1 | V0 | U0
@@ -55,7 +111,6 @@
     "lwl             $t7, 27(%[src_uv])            \n"  // V13 | U13 | V12 | U12
     "lwr             $t8, 28(%[src_uv])            \n"
     "lwl             $t8, 31(%[src_uv])            \n"  // V15 | U15 | V14 | U14
-
     "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
     "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
     "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
@@ -65,7 +120,6 @@
     "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
     "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
     "addiu           %[src_uv], %[src_uv], 32      \n"
-
     "swr             $t9, 0(%[dst_v])              \n"
     "swl             $t9, 3(%[dst_v])              \n"
     "swr             $t0, 0(%[dst_u])              \n"
@@ -88,47 +142,8 @@
 
     "beqz            %[width], 3f                  \n"
     " nop                                          \n"
-    "b               2f                            \n"
-    " nop                                          \n"
 
-    // src and dst are aligned
-    "12:                                           \n"
-    "addiu           $t4, $t4, -1                  \n"
-    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
-    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
-    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
-    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
-    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
-    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
-    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
-    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
-
-    "addiu           %[src_uv], %[src_uv], 32      \n"
-    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
-    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
-    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
-    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
-    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
-    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
-    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
-    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
-
-    "sw              $t9, 0(%[dst_v])              \n"
-    "sw              $t0, 0(%[dst_u])              \n"
-    "sw              $t1, 4(%[dst_v])              \n"
-    "sw              $t2, 4(%[dst_u])              \n"
-    "sw              $t3, 8(%[dst_v])              \n"
-    "sw              $t5, 8(%[dst_u])              \n"
-    "sw              $t6, 12(%[dst_v])             \n"
-    "sw              $t7, 12(%[dst_u])             \n"
-    "addiu           %[dst_v], %[dst_v], 16        \n"
-    "bgtz            $t4, 12b                      \n"
-    " addiu          %[dst_u], %[dst_u], 16        \n"
-
-    "beqz            %[width], 3f                  \n"
-    " nop                                          \n"
-
-    "2:                                            \n"
+  "2:                                              \n"
     "lbu             $t0, 0(%[src_uv])             \n"
     "lbu             $t1, 1(%[src_uv])             \n"
     "addiu           %[src_uv], %[src_uv], 2       \n"
@@ -139,7 +154,7 @@
     "bgtz            %[width], 2b                  \n"
     " addiu          %[dst_v], %[dst_v], 1         \n"
 
-    "3:                                            \n"
+  "3:                                              \n"
     ".set pop                                      \n"
      : [src_uv] "+r" (src_uv),
        [width] "+r" (width),