Merge "MIPS: Implement Sum-of-Abs-Differences"
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index 384b642..3cf150a 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -1071,11 +1071,195 @@
 
 void LocationsBuilderMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
   CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
+  LocationSummary* locations = instruction->GetLocations();
+  // Every operand/result type combination gets one FPU temporary, added unconditionally here.
+  locations->AddTemp(Location::RequiresFpuRegister());
+  // Second FPU temp: 8/16-bit operands always; Int32 operands only when the result is not Int32.
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+  DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
+            HVecOperation::ToSignedType(b->GetPackedType()));  // Must agree after ToSignedType().
+  switch (a->GetPackedType()) {  // Keyed on the operand type, not the result type.
+    case DataType::Type::kInt32:
+      if (instruction->GetPackedType() == DataType::Type::kInt32) {
+        break;  // Int32 operands with Int32 result: no second temporary.
+      }
+      FALLTHROUGH_INTENDED;  // Int32 operands with any other result type share the path below.
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      locations->AddTemp(Location::RequiresFpuRegister());  // Read back as GetTemp(1) in codegen.
+      break;
+    default:
+      break;  // All remaining operand types keep the single temporary.
+  }
 }
 
 void InstructionCodeGeneratorMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  // TODO: implement this, location helper already filled out (shared with MulAcc).
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister acc = VectorRegisterFrom(locations->InAt(0));  // Input 0: running accumulator.
+  VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+  VectorRegister tmp = static_cast<VectorRegister>(FTMP);  // Filled from ZERO in every case below.
+  VectorRegister tmp1 = VectorRegisterFrom(locations->GetTemp(0));
+
+  DCHECK(locations->InAt(0).Equals(locations->Out()));  // acc is updated in place.
+
+  // Outer switch: operand type S; inner switch: result type T, for acc_T += sad(a_S, b_S).
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+  DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
+            HVecOperation::ToSignedType(b->GetPackedType()));  // Must agree after ToSignedType().
+  switch (a->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, a->GetVectorLength());  // 16 operand lanes per vector.
+      switch (instruction->GetPackedType()) {
+        case DataType::Type::kUint16:
+        case DataType::Type::kInt16: {
+          DCHECK_EQ(8u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));  // Second temporary.
+          __ FillB(tmp, ZERO);
+          __ Hadd_sH(tmp1, left, tmp);  // Presumably sign-extends odd lanes of left; confirm.
+          __ Hadd_sH(tmp2, right, tmp);  // Same treatment for right.
+          __ Asub_sH(tmp1, tmp1, tmp2);  // Presumably |tmp1 - tmp2| per lane; confirm.
+          __ AddvH(acc, acc, tmp1);  // Accumulate pass 1.
+          __ Hadd_sH(tmp1, tmp, left);  // Pass 2: sources swapped; presumably even lanes.
+          __ Hadd_sH(tmp2, tmp, right);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ AddvH(acc, acc, tmp1);  // Accumulate pass 2.
+          break;
+        }
+        case DataType::Type::kInt32: {
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillB(tmp, ZERO);
+          __ Hadd_sH(tmp1, left, tmp);  // Pass 1: same sequence as the Int16 result case.
+          __ Hadd_sH(tmp2, right, tmp);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ Hadd_sW(tmp1, tmp1, tmp1);  // Presumably sums halfword pairs into words; confirm.
+          __ AddvW(acc, acc, tmp1);
+          __ Hadd_sH(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sH(tmp2, tmp, right);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ Hadd_sW(tmp1, tmp1, tmp1);
+          __ AddvW(acc, acc, tmp1);
+          break;
+        }
+        case DataType::Type::kInt64: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillB(tmp, ZERO);
+          __ Hadd_sH(tmp1, left, tmp);  // Pass 1.
+          __ Hadd_sH(tmp2, right, tmp);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ Hadd_sW(tmp1, tmp1, tmp1);  // Two further Hadd steps (W, then D) before AddvD.
+          __ Hadd_sD(tmp1, tmp1, tmp1);
+          __ AddvD(acc, acc, tmp1);
+          __ Hadd_sH(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sH(tmp2, tmp, right);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ Hadd_sW(tmp1, tmp1, tmp1);
+          __ Hadd_sD(tmp1, tmp1, tmp1);
+          __ AddvD(acc, acc, tmp1);
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, a->GetVectorLength());  // 8 operand lanes per vector.
+      switch (instruction->GetPackedType()) {
+        case DataType::Type::kInt32: {
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillH(tmp, ZERO);
+          __ Hadd_sW(tmp1, left, tmp);  // Pass 1: same two-pass shape as the byte cases, W width.
+          __ Hadd_sW(tmp2, right, tmp);
+          __ Asub_sW(tmp1, tmp1, tmp2);
+          __ AddvW(acc, acc, tmp1);
+          __ Hadd_sW(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sW(tmp2, tmp, right);
+          __ Asub_sW(tmp1, tmp1, tmp2);
+          __ AddvW(acc, acc, tmp1);
+          break;
+        }
+        case DataType::Type::kInt64: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillH(tmp, ZERO);
+          __ Hadd_sW(tmp1, left, tmp);  // Pass 1.
+          __ Hadd_sW(tmp2, right, tmp);
+          __ Asub_sW(tmp1, tmp1, tmp2);
+          __ Hadd_sD(tmp1, tmp1, tmp1);  // One further Hadd step (D) before AddvD.
+          __ AddvD(acc, acc, tmp1);
+          __ Hadd_sW(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sW(tmp2, tmp, right);
+          __ Asub_sW(tmp1, tmp1, tmp2);
+          __ Hadd_sD(tmp1, tmp1, tmp1);
+          __ AddvD(acc, acc, tmp1);
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, a->GetVectorLength());  // 4 operand lanes per vector.
+      switch (instruction->GetPackedType()) {
+        case DataType::Type::kInt32: {
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          __ FillW(tmp, ZERO);  // Same-width case: single pass, only tmp and tmp1 are used.
+          __ SubvW(tmp1, left, right);
+          __ Add_aW(tmp1, tmp1, tmp);  // Presumably |tmp1| + |tmp|, tmp being zero; confirm.
+          __ AddvW(acc, acc, tmp1);
+          break;
+        }
+        case DataType::Type::kInt64: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillW(tmp, ZERO);
+          __ Hadd_sD(tmp1, left, tmp);  // Pass 1: same two-pass shape, D width.
+          __ Hadd_sD(tmp2, right, tmp);
+          __ Asub_sD(tmp1, tmp1, tmp2);
+          __ AddvD(acc, acc, tmp1);
+          __ Hadd_sD(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sD(tmp2, tmp, right);
+          __ Asub_sD(tmp1, tmp1, tmp2);
+          __ AddvD(acc, acc, tmp1);
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case DataType::Type::kInt64: {
+      DCHECK_EQ(2u, a->GetVectorLength());  // 2 operand lanes per vector.
+      switch (instruction->GetPackedType()) {
+        case DataType::Type::kInt64: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          __ FillW(tmp, ZERO);  // Word fill from ZERO; the MIPS64 variant uses FillD here.
+          __ SubvD(tmp1, left, right);
+          __ Add_aD(tmp1, tmp1, tmp);  // Presumably |tmp1| + |tmp|, tmp being zero; confirm.
+          __ AddvD(acc, acc, tmp1);
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 // Helper to set up locations for vector memory operations.
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index 0c59b73..2d69533 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -1069,11 +1069,195 @@
 
 void LocationsBuilderMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
   CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
+  LocationSummary* locations = instruction->GetLocations();
+  // Every operand/result type combination gets one FPU temporary, added unconditionally here.
+  locations->AddTemp(Location::RequiresFpuRegister());
+  // Second FPU temp: 8/16-bit operands always; Int32 operands only when the result is not Int32.
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+  DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
+            HVecOperation::ToSignedType(b->GetPackedType()));  // Must agree after ToSignedType().
+  switch (a->GetPackedType()) {  // Keyed on the operand type, not the result type.
+    case DataType::Type::kInt32:
+      if (instruction->GetPackedType() == DataType::Type::kInt32) {
+        break;  // Int32 operands with Int32 result: no second temporary.
+      }
+      FALLTHROUGH_INTENDED;  // Int32 operands with any other result type share the path below.
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      locations->AddTemp(Location::RequiresFpuRegister());  // Read back as GetTemp(1) in codegen.
+      break;
+    default:
+      break;  // All remaining operand types keep the single temporary.
+  }
 }
 
 void InstructionCodeGeneratorMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  // TODO: implement this, location helper already filled out (shared with MulAcc).
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister acc = VectorRegisterFrom(locations->InAt(0));  // Input 0: running accumulator.
+  VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+  VectorRegister tmp = static_cast<VectorRegister>(FTMP);  // Filled from ZERO in every case below.
+  VectorRegister tmp1 = VectorRegisterFrom(locations->GetTemp(0));
+
+  DCHECK(locations->InAt(0).Equals(locations->Out()));  // acc is updated in place.
+
+  // Outer switch: operand type S; inner switch: result type T, for acc_T += sad(a_S, b_S).
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+  DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
+            HVecOperation::ToSignedType(b->GetPackedType()));  // Must agree after ToSignedType().
+  switch (a->GetPackedType()) {
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, a->GetVectorLength());  // 16 operand lanes per vector.
+      switch (instruction->GetPackedType()) {
+        case DataType::Type::kUint16:
+        case DataType::Type::kInt16: {
+          DCHECK_EQ(8u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));  // Second temporary.
+          __ FillB(tmp, ZERO);
+          __ Hadd_sH(tmp1, left, tmp);  // Presumably sign-extends odd lanes of left; confirm.
+          __ Hadd_sH(tmp2, right, tmp);  // Same treatment for right.
+          __ Asub_sH(tmp1, tmp1, tmp2);  // Presumably |tmp1 - tmp2| per lane; confirm.
+          __ AddvH(acc, acc, tmp1);  // Accumulate pass 1.
+          __ Hadd_sH(tmp1, tmp, left);  // Pass 2: sources swapped; presumably even lanes.
+          __ Hadd_sH(tmp2, tmp, right);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ AddvH(acc, acc, tmp1);  // Accumulate pass 2.
+          break;
+        }
+        case DataType::Type::kInt32: {
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillB(tmp, ZERO);
+          __ Hadd_sH(tmp1, left, tmp);  // Pass 1: same sequence as the Int16 result case.
+          __ Hadd_sH(tmp2, right, tmp);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ Hadd_sW(tmp1, tmp1, tmp1);  // Presumably sums halfword pairs into words; confirm.
+          __ AddvW(acc, acc, tmp1);
+          __ Hadd_sH(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sH(tmp2, tmp, right);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ Hadd_sW(tmp1, tmp1, tmp1);
+          __ AddvW(acc, acc, tmp1);
+          break;
+        }
+        case DataType::Type::kInt64: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillB(tmp, ZERO);
+          __ Hadd_sH(tmp1, left, tmp);  // Pass 1.
+          __ Hadd_sH(tmp2, right, tmp);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ Hadd_sW(tmp1, tmp1, tmp1);  // Two further Hadd steps (W, then D) before AddvD.
+          __ Hadd_sD(tmp1, tmp1, tmp1);
+          __ AddvD(acc, acc, tmp1);
+          __ Hadd_sH(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sH(tmp2, tmp, right);
+          __ Asub_sH(tmp1, tmp1, tmp2);
+          __ Hadd_sW(tmp1, tmp1, tmp1);
+          __ Hadd_sD(tmp1, tmp1, tmp1);
+          __ AddvD(acc, acc, tmp1);
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, a->GetVectorLength());  // 8 operand lanes per vector.
+      switch (instruction->GetPackedType()) {
+        case DataType::Type::kInt32: {
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillH(tmp, ZERO);
+          __ Hadd_sW(tmp1, left, tmp);  // Pass 1: same two-pass shape as the byte cases, W width.
+          __ Hadd_sW(tmp2, right, tmp);
+          __ Asub_sW(tmp1, tmp1, tmp2);
+          __ AddvW(acc, acc, tmp1);
+          __ Hadd_sW(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sW(tmp2, tmp, right);
+          __ Asub_sW(tmp1, tmp1, tmp2);
+          __ AddvW(acc, acc, tmp1);
+          break;
+        }
+        case DataType::Type::kInt64: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillH(tmp, ZERO);
+          __ Hadd_sW(tmp1, left, tmp);  // Pass 1.
+          __ Hadd_sW(tmp2, right, tmp);
+          __ Asub_sW(tmp1, tmp1, tmp2);
+          __ Hadd_sD(tmp1, tmp1, tmp1);  // One further Hadd step (D) before AddvD.
+          __ AddvD(acc, acc, tmp1);
+          __ Hadd_sW(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sW(tmp2, tmp, right);
+          __ Asub_sW(tmp1, tmp1, tmp2);
+          __ Hadd_sD(tmp1, tmp1, tmp1);
+          __ AddvD(acc, acc, tmp1);
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case DataType::Type::kInt32:
+      DCHECK_EQ(4u, a->GetVectorLength());  // 4 operand lanes per vector.
+      switch (instruction->GetPackedType()) {
+        case DataType::Type::kInt32: {
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          __ FillW(tmp, ZERO);  // Same-width case: single pass, only tmp and tmp1 are used.
+          __ SubvW(tmp1, left, right);
+          __ Add_aW(tmp1, tmp1, tmp);  // Presumably |tmp1| + |tmp|, tmp being zero; confirm.
+          __ AddvW(acc, acc, tmp1);
+          break;
+        }
+        case DataType::Type::kInt64: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VectorRegister tmp2 = VectorRegisterFrom(locations->GetTemp(1));
+          __ FillW(tmp, ZERO);
+          __ Hadd_sD(tmp1, left, tmp);  // Pass 1: same two-pass shape, D width.
+          __ Hadd_sD(tmp2, right, tmp);
+          __ Asub_sD(tmp1, tmp1, tmp2);
+          __ AddvD(acc, acc, tmp1);
+          __ Hadd_sD(tmp1, tmp, left);  // Pass 2: sources swapped.
+          __ Hadd_sD(tmp2, tmp, right);
+          __ Asub_sD(tmp1, tmp1, tmp2);
+          __ AddvD(acc, acc, tmp1);
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case DataType::Type::kInt64: {
+      DCHECK_EQ(2u, a->GetVectorLength());  // 2 operand lanes per vector.
+      switch (instruction->GetPackedType()) {
+        case DataType::Type::kInt64: {
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          __ FillD(tmp, ZERO);  // Doubleword fill from ZERO; the MIPS32 variant uses FillW here.
+          __ SubvD(tmp1, left, right);
+          __ Add_aD(tmp1, tmp1, tmp);  // Presumably |tmp1| + |tmp|, tmp being zero; confirm.
+          __ AddvD(acc, acc, tmp1);
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 // Helper to set up locations for vector memory operations.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 69c5827..fcc59ea 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -1512,17 +1512,17 @@
           case DataType::Type::kBool:
           case DataType::Type::kUint8:
           case DataType::Type::kInt8:
-            *restrictions |= kNoDiv | kNoReduction | kNoSAD;
+            *restrictions |= kNoDiv;
             return TrySetVectorLength(16);
           case DataType::Type::kUint16:
           case DataType::Type::kInt16:
-            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
+            *restrictions |= kNoDiv | kNoStringCharAt;
             return TrySetVectorLength(8);
           case DataType::Type::kInt32:
-            *restrictions |= kNoDiv | kNoSAD;
+            *restrictions |= kNoDiv;
             return TrySetVectorLength(4);
           case DataType::Type::kInt64:
-            *restrictions |= kNoDiv | kNoSAD;
+            *restrictions |= kNoDiv;
             return TrySetVectorLength(2);
           case DataType::Type::kFloat32:
             *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
@@ -1541,17 +1541,17 @@
           case DataType::Type::kBool:
           case DataType::Type::kUint8:
           case DataType::Type::kInt8:
-            *restrictions |= kNoDiv | kNoReduction | kNoSAD;
+            *restrictions |= kNoDiv;
             return TrySetVectorLength(16);
           case DataType::Type::kUint16:
           case DataType::Type::kInt16:
-            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
+            *restrictions |= kNoDiv | kNoStringCharAt;
             return TrySetVectorLength(8);
           case DataType::Type::kInt32:
-            *restrictions |= kNoDiv | kNoSAD;
+            *restrictions |= kNoDiv;
             return TrySetVectorLength(4);
           case DataType::Type::kInt64:
-            *restrictions |= kNoDiv | kNoSAD;
+            *restrictions |= kNoDiv;
             return TrySetVectorLength(2);
           case DataType::Type::kFloat32:
             *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
diff --git a/test/651-checker-byte-simd-minmax/src/Main.java b/test/651-checker-byte-simd-minmax/src/Main.java
index d365689..2188346 100644
--- a/test/651-checker-byte-simd-minmax/src/Main.java
+++ b/test/651-checker-byte-simd-minmax/src/Main.java
@@ -183,6 +183,13 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>  outer_loop:none
   /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>       outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.doitMin100(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                      loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]        loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>  outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>       outer_loop:none
   private static void doitMin100(byte[] x, byte[] y) {
     int min = Math.min(x.length, y.length);
     for (int i = 0; i < min; i++) {
diff --git a/test/651-checker-char-simd-minmax/src/Main.java b/test/651-checker-char-simd-minmax/src/Main.java
index 72e8958..d92bdaf 100644
--- a/test/651-checker-char-simd-minmax/src/Main.java
+++ b/test/651-checker-char-simd-minmax/src/Main.java
@@ -97,6 +97,13 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
   /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>         outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.doitMin100(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                      loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]        loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>         outer_loop:none
   private static void doitMin100(char[] x, char[] y) {
     int min = Math.min(x.length, y.length);
     for (int i = 0; i < min; i++) {
diff --git a/test/651-checker-short-simd-minmax/src/Main.java b/test/651-checker-short-simd-minmax/src/Main.java
index d8c4d1e..91f2a2d 100644
--- a/test/651-checker-short-simd-minmax/src/Main.java
+++ b/test/651-checker-short-simd-minmax/src/Main.java
@@ -183,6 +183,13 @@
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>   outer_loop:none
   /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>        outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.doitMin100(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                      loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]        loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>   outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>        outer_loop:none
   private static void doitMin100(short[] x, short[] y) {
     int min = Math.min(x.length, y.length);
     for (int i = 0; i < min; i++) {
diff --git a/test/656-checker-simd-opt/src/Main.java b/test/656-checker-simd-opt/src/Main.java
index 39a126f..31d28e8 100644
--- a/test/656-checker-simd-opt/src/Main.java
+++ b/test/656-checker-simd-opt/src/Main.java
@@ -114,6 +114,19 @@
   /// CHECK-DAG: <<Phi2:d\d+>>  Phi [<<Set>>,{{d\d+}}]       loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                VecAdd [<<Phi2>>,<<Rep>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                Add [<<Phi1>>,<<L2>>]        loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.longInductionReduction(long[]) loop_optimization (after)
+  /// CHECK-DAG: <<L0:j\d+>>    LongConstant 0               loop:none
+  /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1               loop:none
+  /// CHECK-DAG: <<L2:j\d+>>    LongConstant 2               loop:none
+  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                loop:none
+  /// CHECK-DAG: <<Get:j\d+>>   ArrayGet [{{l\d+}},<<I0>>]   loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>   VecReplicateScalar [<<Get>>] loop:none
+  /// CHECK-DAG: <<Set:d\d+>>   VecSetScalars [<<L1>>]       loop:none
+  /// CHECK-DAG: <<Phi1:j\d+>>  Phi [<<L0>>,{{j\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>  Phi [<<Set>>,{{d\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                VecAdd [<<Phi2>>,<<Rep>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                Add [<<Phi1>>,<<L2>>]        loop:<<Loop>>      outer_loop:none
   static long longInductionReduction(long[] y) {
     long x = 1;
     for (long i = 0; i < 10; i++) {
@@ -141,6 +154,17 @@
   /// CHECK-DAG: <<Phi:i\d+>>   Phi [<<I0>>,{{i\d+}}]               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:                VecStore [{{l\d+}},<<Phi>>,<<Rep>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                Add [<<Phi>>,<<I4>>]                loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.intVectorLongInvariant(int[], long[]) loop_optimization (after)
+  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                       loop:none
+  /// CHECK-DAG: <<I1:i\d+>>    IntConstant 1                       loop:none
+  /// CHECK-DAG: <<I4:i\d+>>    IntConstant 4                       loop:none
+  /// CHECK-DAG: <<Get:j\d+>>   ArrayGet [{{l\d+}},<<I0>>]          loop:none
+  /// CHECK-DAG: <<Cnv:i\d+>>   TypeConversion [<<Get>>]            loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>   VecReplicateScalar [<<Cnv>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>   Phi [<<I0>>,{{i\d+}}]               loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:                VecStore [{{l\d+}},<<Phi>>,<<Rep>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                Add [<<Phi>>,<<I4>>]                loop:<<Loop>>      outer_loop:none
   static void intVectorLongInvariant(int[] x, long[] y) {
     for (int i = 0; i < 100; i++) {
       x[i] = (int) y[0];
@@ -170,6 +194,18 @@
   /// CHECK-DAG: <<Add:d\d+>>   VecAdd [<<Load>>,<<Rep>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                VecStore [{{l\d+}},<<Phi>>,<<Add>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                Add [<<Phi>>,<<I4>>]                loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: void Main.longCanBeDoneWithInt(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                       loop:none
+  /// CHECK-DAG: <<I4:i\d+>>    IntConstant 4                       loop:none
+  /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1                      loop:none
+  /// CHECK-DAG: <<Cnv:i\d+>>   TypeConversion [<<L1>>]             loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>   VecReplicateScalar [<<Cnv>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>   Phi [<<I0>>,{{i\d+}}]               loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>  VecLoad [{{l\d+}},<<Phi>>]          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add:d\d+>>   VecAdd [<<Load>>,<<Rep>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                VecStore [{{l\d+}},<<Phi>>,<<Add>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                Add [<<Phi>>,<<I4>>]                loop:<<Loop>>      outer_loop:none
   static void longCanBeDoneWithInt(int[] x, int[] y) {
     for (int i = 0; i < 100; i++) {
       x[i] = (int) (y[i] + 1L);
diff --git a/test/660-checker-simd-sad-byte/src/Main.java b/test/660-checker-simd-sad-byte/src/Main.java
index 72d1c24..877d718 100644
--- a/test/660-checker-simd-sad-byte/src/Main.java
+++ b/test/660-checker-simd-sad-byte/src/Main.java
@@ -109,6 +109,17 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadByte2Int(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
   private static int sadByte2Int(byte[] b1, byte[] b2) {
     int min_length = Math.min(b1.length, b2.length);
     int sad = 0;
@@ -140,6 +151,17 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadByte2IntAlt(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
   private static int sadByte2IntAlt(byte[] b1, byte[] b2) {
     int min_length = Math.min(b1.length, b2.length);
     int sad = 0;
@@ -173,6 +195,17 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadByte2IntAlt2(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
   private static int sadByte2IntAlt2(byte[] b1, byte[] b2) {
     int min_length = Math.min(b1.length, b2.length);
     int sad = 0;
@@ -212,6 +245,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadByte2Long(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
   private static long sadByte2Long(byte[] b1, byte[] b2) {
     int min_length = Math.min(b1.length, b2.length);
     long sad = 0;
@@ -249,6 +294,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadByte2LongAt1(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
   private static long sadByte2LongAt1(byte[] b1, byte[] b2) {
     int min_length = Math.min(b1.length, b2.length);
     long sad = 1;  // starts at 1
diff --git a/test/660-checker-simd-sad-char/src/Main.java b/test/660-checker-simd-sad-char/src/Main.java
index 2535d49..ba22614 100644
--- a/test/660-checker-simd-sad-char/src/Main.java
+++ b/test/660-checker-simd-sad-char/src/Main.java
@@ -68,7 +68,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: int Main.sadChar2Int(char[], char[]) loop_optimization (after)
+  /// CHECK-START: int Main.sadChar2Int(char[], char[]) loop_optimization (after)
   /// CHECK-NOT: VecSADAccumulate
   private static int sadChar2Int(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
@@ -91,7 +91,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: int Main.sadChar2IntAlt(char[], char[]) loop_optimization (after)
+  /// CHECK-START: int Main.sadChar2IntAlt(char[], char[]) loop_optimization (after)
   /// CHECK-NOT: VecSADAccumulate
   private static int sadChar2IntAlt(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
@@ -116,7 +116,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: int Main.sadChar2IntAlt2(char[], char[]) loop_optimization (after)
+  /// CHECK-START: int Main.sadChar2IntAlt2(char[], char[]) loop_optimization (after)
   /// CHECK-NOT: VecSADAccumulate
   private static int sadChar2IntAlt2(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
@@ -146,7 +146,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadChar2Long(char[], char[]) loop_optimization (after)
+  /// CHECK-START: long Main.sadChar2Long(char[], char[]) loop_optimization (after)
   /// CHECK-NOT: VecSADAccumulate
   private static long sadChar2Long(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
@@ -174,7 +174,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadChar2LongAt1(char[], char[]) loop_optimization (after)
+  /// CHECK-START: long Main.sadChar2LongAt1(char[], char[]) loop_optimization (after)
   /// CHECK-NOT: VecSADAccumulate
   private static long sadChar2LongAt1(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
diff --git a/test/660-checker-simd-sad-int/src/Main.java b/test/660-checker-simd-sad-int/src/Main.java
index 388bfba..d7d5a95 100644
--- a/test/660-checker-simd-sad-int/src/Main.java
+++ b/test/660-checker-simd-sad-int/src/Main.java
@@ -48,6 +48,15 @@
   /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]                      loop:<<Loop>> outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadInt2Int(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                              loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]                   loop:none
+  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]                     loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Ld1:d\d+>>    VecLoad [{{l\d+}},<<I:i\d+>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]                      loop:<<Loop>> outer_loop:none
   private static int sadInt2Int(int[] x, int[] y) {
     int min_length = Math.min(x.length, y.length);
     int sad = 0;
@@ -72,10 +81,7 @@
   //
   // No ABS? No SAD!
   //
-  /// CHECK-START-ARM: int Main.sadInt2IntAlt(int[], int[]) loop_optimization (after)
-  /// CHECK-NOT: VecSADAccumulate
-  //
-  /// CHECK-START-ARM64: int Main.sadInt2IntAlt(int[], int[]) loop_optimization (after)
+  /// CHECK-START: int Main.sadInt2IntAlt(int[], int[]) loop_optimization (after)
   /// CHECK-NOT: VecSADAccumulate
   private static int sadInt2IntAlt(int[] x, int[] y) {
     int min_length = Math.min(x.length, y.length);
@@ -117,6 +123,15 @@
   /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]                      loop:<<Loop>> outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadInt2IntAlt2(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                              loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]                   loop:none
+  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]                     loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Ld1:d\d+>>    VecLoad [{{l\d+}},<<I:i\d+>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]                      loop:<<Loop>> outer_loop:none
   private static int sadInt2IntAlt2(int[] x, int[] y) {
     int min_length = Math.min(x.length, y.length);
     int sad = 0;
@@ -156,6 +171,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadInt2Long(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
   private static long sadInt2Long(int[] x, int[] y) {
     int min_length = Math.min(x.length, y.length);
     long sad = 0;
@@ -193,6 +220,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadInt2LongAt1(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
   private static long sadInt2LongAt1(int[] x, int[] y) {
     int min_length = Math.min(x.length, y.length);
     long sad = 1;  // starts at 1
diff --git a/test/660-checker-simd-sad-long/src/Main.java b/test/660-checker-simd-sad-long/src/Main.java
index 06f62bd..d080e0c 100644
--- a/test/660-checker-simd-sad-long/src/Main.java
+++ b/test/660-checker-simd-sad-long/src/Main.java
@@ -43,6 +43,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadLong2Long(long[], long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
   private static long sadLong2Long(long[] x, long[] y) {
     int min_length = Math.min(x.length, y.length);
     long sad = 0;
@@ -105,6 +117,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadLong2LongAlt2(long[], long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
   private static long sadLong2LongAlt2(long[] x, long[] y) {
     int min_length = Math.min(x.length, y.length);
     long sad = 0;
@@ -142,6 +166,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadLong2LongAt1(long[], long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
   private static long sadLong2LongAt1(long[] x, long[] y) {
     int min_length = Math.min(x.length, y.length);
     long sad = 1;  // starts at 1
diff --git a/test/660-checker-simd-sad-short/src/Main.java b/test/660-checker-simd-sad-short/src/Main.java
index d94308e..4ab6682 100644
--- a/test/660-checker-simd-sad-short/src/Main.java
+++ b/test/660-checker-simd-sad-short/src/Main.java
@@ -76,6 +76,17 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2Int(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadShort2Int(short[] s1, short[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     int sad = 0;
@@ -107,6 +118,17 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntAlt(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadShort2IntAlt(short[] s1, short[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     int sad = 0;
@@ -140,6 +162,17 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntAlt2(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadShort2IntAlt2(short[] s1, short[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     int sad = 0;
@@ -179,6 +212,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadShort2Long(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static long sadShort2Long(short[] s1, short[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     long sad = 0;
@@ -216,6 +261,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadShort2LongAt1(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static long sadShort2LongAt1(short[] s1, short[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     long sad = 1;  // starts at 1
diff --git a/test/660-checker-simd-sad-short2/src/Main.java b/test/660-checker-simd-sad-short2/src/Main.java
index 708f3aa..331f5ce 100644
--- a/test/660-checker-simd-sad-short2/src/Main.java
+++ b/test/660-checker-simd-sad-short2/src/Main.java
@@ -94,6 +94,17 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadCastedChar2Int(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadCastedChar2Int(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     int sad = 0;
@@ -144,6 +155,17 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadCastedChar2IntAlt(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadCastedChar2IntAlt(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     int sad = 0;
@@ -196,6 +218,17 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadCastedChar2IntAlt2(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadCastedChar2IntAlt2(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     int sad = 0;
@@ -254,6 +287,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadCastedChar2Long(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static long sadCastedChar2Long(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     long sad = 0;
@@ -310,6 +355,18 @@
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: long Main.sadCastedChar2LongAt1(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static long sadCastedChar2LongAt1(char[] s1, char[] s2) {
     int min_length = Math.min(s1.length, s2.length);
     long sad = 1;  // starts at 1
diff --git a/test/660-checker-simd-sad-short3/src/Main.java b/test/660-checker-simd-sad-short3/src/Main.java
index c8850b4..ecda884 100644
--- a/test/660-checker-simd-sad-short3/src/Main.java
+++ b/test/660-checker-simd-sad-short3/src/Main.java
@@ -44,6 +44,18 @@
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load>>,<<Rep>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntParamRight(short[], short) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Param:s\d+>>  ParameterValue                 loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<Param>>] loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load>>,<<Rep>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadShort2IntParamRight(short[] s, short param) {
     int sad = 0;
     for (int i = 0; i < s.length; i++) {
@@ -75,6 +87,18 @@
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Rep>>,<<Load>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntParamLeft(short[], short) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Param:s\d+>>  ParameterValue                 loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<Param>>] loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Rep>>,<<Load>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadShort2IntParamLeft(short[] s, short param) {
     int sad = 0;
     for (int i = 0; i < s.length; i++) {
@@ -106,6 +130,18 @@
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load>>,<<Rep>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntConstRight(short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsI:i\d+>>  IntConstant 32767              loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<ConsI>>] loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load>>,<<Rep>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadShort2IntConstRight(short[] s) {
     int sad = 0;
     for (int i = 0; i < s.length; i++) {
@@ -137,6 +173,18 @@
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Rep>>,<<Load>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntConstLeft(short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsI:i\d+>>  IntConstant 32767              loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<ConsI>>] loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Rep>>,<<Load>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadShort2IntConstLeft(short[] s) {
     int sad = 0;
     for (int i = 0; i < s.length; i++) {
@@ -168,6 +216,18 @@
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load>>,<<Rep>>] loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntInvariantRight(short[], int) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Conv:s\d+>>   TypeConversion [{{i\d+}}]      loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<Conv>>]  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load>>,<<Rep>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
   private static int sadShort2IntInvariantRight(short[] s, int val) {
     int sad = 0;
     short x = (short) (val + 1);
@@ -199,6 +259,17 @@
   /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Rep>>,<<Load>>] loop:<<Loop>> outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntInvariantLeft(short[], int) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Conv:s\d+>>   TypeConversion [{{i\d+}}]      loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<Conv>>]  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Rep>>,<<Load>>] loop:<<Loop>> outer_loop:none
   private static int sadShort2IntInvariantLeft(short[] s, int val) {
     int sad = 0;
     short x = (short) (val + 1);
@@ -233,6 +304,18 @@
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Add:d\d+>>    VecAdd [<<Load>>,<<Rep>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load>>,<<Add>>] loop:<<Loop>> outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntCastedExprRight(short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsI:i\d+>>  IntConstant 110                loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<ConsI>>] loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add:d\d+>>    VecAdd [<<Load>>,<<Rep>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load>>,<<Add>>] loop:<<Loop>> outer_loop:none
   private static int sadShort2IntCastedExprRight(short[] s) {
     int sad = 0;
     for (int i = 0; i < s.length; i++) {
@@ -267,6 +350,18 @@
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Add:d\d+>>    VecAdd [<<Load>>,<<Rep>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Add>>,<<Load>>] loop:<<Loop>> outer_loop:none
+  //
+  /// CHECK-START-MIPS64: int Main.sadShort2IntCastedExprLeft(short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsI:i\d+>>  IntConstant 110                loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>    VecReplicateScalar [<<ConsI>>] loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add:d\d+>>    VecAdd [<<Load>>,<<Rep>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Add>>,<<Load>>] loop:<<Loop>> outer_loop:none
   private static int sadShort2IntCastedExprLeft(short[] s) {
     int sad = 0;
     for (int i = 0; i < s.length; i++) {