Merge "Set basic framework for detecting reductions."
diff --git a/compiler/optimizing/induction_var_range.h b/compiler/optimizing/induction_var_range.h
index ab1772b..0b980f5 100644
--- a/compiler/optimizing/induction_var_range.h
+++ b/compiler/optimizing/induction_var_range.h
@@ -151,6 +151,16 @@
   }
 
   /**
+   * Checks if the given phi instruction has been classified as anything by
+   * induction variable analysis. Returns false for anything that cannot be
+   * classified statically, such as reductions or other complex cycles.
+   */
+  bool IsClassified(HPhi* phi) const {
+    HLoopInformation* lp = phi->GetBlock()->GetLoopInformation();  // closest enveloping loop
+    return (lp != nullptr) && (induction_analysis_->LookupInfo(lp, phi) != nullptr);
+  }
+
+  /**
    * Checks if header logic of a loop terminates. Sets trip-count tc if known.
    */
   bool IsFinite(HLoopInformation* loop, /*out*/ int64_t* tc) const;
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 0ef7dcd..4143462 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -256,6 +256,35 @@
   return false;
 }
 
+// Detect reductions of the following forms,
+// under assumption phi has only *one* use:
+//   x = x_phi + ..
+//   x = x_phi - ..
+//   x = max(x_phi, ..)
+//   x = min(x_phi, ..)
+static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
+  if (reduction->IsAdd()) {
+    return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+  } else if (reduction->IsSub()) {
+    return reduction->InputAt(0) == phi;
+  } else if (reduction->IsInvokeStaticOrDirect()) {
+    switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) {
+      case Intrinsics::kMathMinIntInt:
+      case Intrinsics::kMathMinLongLong:
+      case Intrinsics::kMathMinFloatFloat:
+      case Intrinsics::kMathMinDoubleDouble:
+      case Intrinsics::kMathMaxIntInt:
+      case Intrinsics::kMathMaxLongLong:
+      case Intrinsics::kMathMaxFloatFloat:
+      case Intrinsics::kMathMaxDoubleDouble:
+        return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+      default:
+        return false;
+    }
+  }
+  return false;
+}
+
 // Test vector restrictions.
 static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) {
   return (restrictions & tested) != 0;
@@ -280,12 +309,11 @@
       return false;
     }
   }
-
   return true;
 }
 
 //
-// Class methods.
+// Public methods.
 //
 
 HLoopOptimization::HLoopOptimization(HGraph* graph,
@@ -299,7 +327,7 @@
       top_loop_(nullptr),
       last_loop_(nullptr),
       iset_(nullptr),
-      induction_simplication_count_(0),
+      reductions_(nullptr),
       simplified_(false),
       vector_length_(0),
       vector_refs_(nullptr),
@@ -333,6 +361,10 @@
   last_loop_ = top_loop_ = nullptr;
 }
 
+//
+// Loop setup and traversal.
+//
+
 void HLoopOptimization::LocalRun() {
   // Build the linear order using the phase-local allocator. This step enables building
   // a loop hierarchy that properly reflects the outer-inner and previous-next relation.
@@ -351,17 +383,21 @@
   // should use the global allocator.
   if (top_loop_ != nullptr) {
     ArenaSet<HInstruction*> iset(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+    ArenaSafeMap<HInstruction*, HInstruction*> reds(
+        std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
     ArenaSet<ArrayReference> refs(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
     ArenaSafeMap<HInstruction*, HInstruction*> map(
         std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
     // Attach.
     iset_ = &iset;
+    reductions_ = &reds;
     vector_refs_ = &refs;
     vector_map_ = &map;
     // Traverse.
     TraverseLoopsInnerToOuter(top_loop_);
     // Detach.
     iset_ = nullptr;
+    reductions_ = nullptr;
     vector_refs_ = nullptr;
     vector_map_ = nullptr;
   }
@@ -414,16 +450,12 @@
   }
 }
 
-void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
+bool HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
+  bool changed = false;
   for ( ; node != nullptr; node = node->next) {
-    // Visit inner loops first.
-    uint32_t current_induction_simplification_count = induction_simplication_count_;
-    if (node->inner != nullptr) {
-      TraverseLoopsInnerToOuter(node->inner);
-    }
-    // Recompute induction information of this loop if the induction
-    // of any inner loop has been simplified.
-    if (current_induction_simplification_count != induction_simplication_count_) {
+    // Visit inner loops first. Recompute induction information for this
+    // loop if the induction of any inner loop has changed.
+    if (TraverseLoopsInnerToOuter(node->inner)) {
       induction_range_.ReVisit(node->loop_info);
     }
     // Repeat simplifications in the loop-body until no more changes occur.
@@ -433,12 +465,14 @@
       simplified_ = false;
       SimplifyInduction(node);
       SimplifyBlocks(node);
+      changed = simplified_ || changed;
     } while (simplified_);
     // Optimize inner loop.
     if (node->inner == nullptr) {
-      OptimizeInnerLoop(node);
+      changed = OptimizeInnerLoop(node) || changed;
     }
   }
+  return changed;
 }
 
 //
@@ -455,21 +489,14 @@
   //           for (int i = 0; i < 10; i++, k++) { .... no k .... } return k;
   for (HInstructionIterator it(header->GetPhis()); !it.Done(); it.Advance()) {
     HPhi* phi = it.Current()->AsPhi();
-    iset_->clear();  // prepare phi induction
     if (TrySetPhiInduction(phi, /*restrict_uses*/ true) &&
+        CanRemoveCycle() &&
         TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ false)) {
-      // Note that it's ok to have replaced uses after the loop with the last value, without
-      // being able to remove the cycle. Environment uses (which are the reason we may not be
-      // able to remove the cycle) within the loop will still hold the right value.
-      if (CanRemoveCycle()) {
-        for (HInstruction* i : *iset_) {
-          RemoveFromCycle(i);
-        }
-
-        // Check that there are no records of the deleted instructions.
-        DCHECK(CheckInductionSetFullyRemoved(iset_));
-        simplified_ = true;
+      simplified_ = true;
+      for (HInstruction* i : *iset_) {
+        RemoveFromCycle(i);
       }
+      DCHECK(CheckInductionSetFullyRemoved(iset_));
     }
   }
 }
@@ -511,21 +538,20 @@
   }
 }
 
-void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
   // Ensure loop header logic is finite.
   int64_t trip_count = 0;
   if (!induction_range_.IsFinite(node->loop_info, &trip_count)) {
-    return;
+    return false;
   }
-
   // Ensure there is only a single loop-body (besides the header).
   HBasicBlock* body = nullptr;
   for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) {
     if (it.Current() != header) {
       if (body != nullptr) {
-        return;
+        return false;
       }
       body = it.Current();
     }
@@ -533,27 +559,27 @@
   CHECK(body != nullptr);
   // Ensure there is only a single exit point.
   if (header->GetSuccessors().size() != 2) {
-    return;
+    return false;
   }
   HBasicBlock* exit = (header->GetSuccessors()[0] == body)
       ? header->GetSuccessors()[1]
       : header->GetSuccessors()[0];
   // Ensure exit can only be reached by exiting loop.
   if (exit->GetPredecessors().size() != 1) {
-    return;
+    return false;
   }
   // Detect either an empty loop (no side effects other than plain iteration) or
   // a trivial loop (just iterating once). Replace subsequent index uses, if any,
   // with the last value and remove the loop, possibly after unrolling its body.
-  HInstruction* phi = header->GetFirstPhi();
-  iset_->clear();  // prepare phi induction
-  if (TrySetSimpleLoopHeader(header)) {
+  HPhi* main_phi = nullptr;
+  if (TrySetSimpleLoopHeader(header, &main_phi)) {
     bool is_empty = IsEmptyBody(body);
-    if ((is_empty || trip_count == 1) &&
-        TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) {
+    if (reductions_->empty() &&  // TODO: possible with some effort
+        (is_empty || trip_count == 1) &&
+        TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) {
       if (!is_empty) {
         // Unroll the loop-body, which sees initial value of the index.
-        phi->ReplaceWith(phi->InputAt(0));
+        main_phi->ReplaceWith(main_phi->InputAt(0));
         preheader->MergeInstructionsWith(body);
       }
       body->DisconnectAndDelete();
@@ -566,21 +592,20 @@
       preheader->AddDominatedBlock(exit);
       exit->SetDominator(preheader);
       RemoveLoop(node);  // update hierarchy
-      return;
+      return true;
     }
   }
-
   // Vectorize loop, if possible and valid.
-  if (kEnableVectorization) {
-    iset_->clear();  // prepare phi induction
-    if (TrySetSimpleLoopHeader(header) &&
-        ShouldVectorize(node, body, trip_count) &&
-        TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) {
-      Vectorize(node, body, exit, trip_count);
-      graph_->SetHasSIMD(true);  // flag SIMD usage
-      return;
-    }
+  if (kEnableVectorization &&
+      TrySetSimpleLoopHeader(header, &main_phi) &&
+      reductions_->empty() &&  // TODO: possible with some effort
+      ShouldVectorize(node, body, trip_count) &&
+      TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) {
+    Vectorize(node, body, exit, trip_count);
+    graph_->SetHasSIMD(true);  // flag SIMD usage
+    return true;
   }
+  return false;
 }
 
 //
@@ -621,6 +646,8 @@
   // aliased, as well as the property that references either point to the same
   // array or to two completely disjoint arrays, i.e., no partial aliasing.
   // Other than a few simply heuristics, no detailed subscript analysis is done.
+  // The scan over references also finds a suitable dynamic loop peeling candidate.
+  const ArrayReference* candidate = nullptr;
   for (auto i = vector_refs_->begin(); i != vector_refs_->end(); ++i) {
     for (auto j = i; ++j != vector_refs_->end(); ) {
       if (i->type == j->type && (i->lhs || j->lhs)) {
@@ -656,7 +683,7 @@
   }
 
   // Consider dynamic loop peeling for alignment.
-  SetPeelingCandidate(trip_count);
+  SetPeelingCandidate(candidate, trip_count);
 
   // Success!
   return true;
@@ -679,14 +706,15 @@
   bool needs_cleanup = trip_count == 0 || (trip_count % chunk) != 0;
 
   // Adjust vector bookkeeping.
-  iset_->clear();  // prepare phi induction
-  bool is_simple_loop_header = TrySetSimpleLoopHeader(header);  // fills iset_
+  HPhi* main_phi = nullptr;
+  bool is_simple_loop_header = TrySetSimpleLoopHeader(header, &main_phi);  // refills sets
   DCHECK(is_simple_loop_header);
   vector_header_ = header;
   vector_body_ = block;
 
-  // Generate dynamic loop peeling trip count, if needed:
-  // ptc = <peeling-needed-for-candidate>
+  // Generate dynamic loop peeling trip count, if needed, under the assumption
+  // that the Android runtime guarantees at least "component size" alignment:
+  // ptc = (ALIGN - (&a[initial] % ALIGN)) / type-size
   HInstruction* ptc = nullptr;
   if (vector_peeling_candidate_ != nullptr) {
     DCHECK_LT(vector_length_, trip_count) << "dynamic peeling currently requires known trip count";
@@ -775,6 +803,7 @@
   while (!header->GetFirstInstruction()->IsGoto()) {
     header->RemoveInstruction(header->GetFirstInstruction());
   }
+
   // Update loop hierarchy: the old header now resides in the same outer loop
   // as the old preheader. Note that we don't bother putting sequential
   // loops back in the hierarchy at this point.
@@ -841,13 +870,12 @@
     vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step);
     Insert(vector_body_, vector_index_);
   }
-  // Finalize phi for the loop index.
+  // Finalize phi inputs for the loop index.
   phi->AddInput(lo);
   phi->AddInput(vector_index_);
   vector_index_ = phi;
 }
 
-// TODO: accept reductions at left-hand-side, mixed-type store idioms, etc.
 bool HLoopOptimization::VectorizeDef(LoopNode* node,
                                      HInstruction* instruction,
                                      bool generate_code) {
@@ -1118,7 +1146,7 @@
     case kArm:
     case kThumb2:
       // Allow vectorization for all ARM devices, because Android assumes that
-      // ARM 32-bit always supports advanced SIMD.
+      // ARM 32-bit always supports advanced SIMD (64-bit SIMD).
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
@@ -1137,7 +1165,7 @@
       return false;
     case kArm64:
       // Allow vectorization for all ARM devices, because Android assumes that
-      // ARMv8 AArch64 always supports advanced SIMD.
+      // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
@@ -1162,7 +1190,7 @@
       }
     case kX86:
     case kX86_64:
-      // Allow vectorization for SSE4-enabled X86 devices only (128-bit vectors).
+      // Allow vectorization for SSE4.1-enabled X86 devices only (128-bit SIMD).
       if (features->AsX86InstructionSetFeatures()->HasSSE4_1()) {
         switch (type) {
           case Primitive::kPrimBoolean:
@@ -1310,8 +1338,6 @@
           global_allocator_, base, opa, type, vector_length_, is_string_char_at);
     }
     // Known dynamically enforced alignment?
-    // TODO: detect offset + constant differences.
-    // TODO: long run, static alignment analysis?
     if (vector_peeling_candidate_ != nullptr &&
         vector_peeling_candidate_->base == base &&
         vector_peeling_candidate_->offset == offset) {
@@ -1586,9 +1612,11 @@
   return true;
 }
 
-void HLoopOptimization::SetPeelingCandidate(int64_t trip_count ATTRIBUTE_UNUSED) {
+void HLoopOptimization::SetPeelingCandidate(const ArrayReference* candidate,
+                                            int64_t trip_count ATTRIBUTE_UNUSED) {
   // Current heuristic: none.
   // TODO: implement
+  vector_peeling_candidate_ = candidate;
 }
 
 uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
@@ -1616,13 +1644,17 @@
 //
 
 bool HLoopOptimization::TrySetPhiInduction(HPhi* phi, bool restrict_uses) {
+  // Start with empty phi induction.
+  iset_->clear();
+
   // Special case Phis that have equivalent in a debuggable setup. Our graph checker isn't
   // smart enough to follow strongly connected components (and it's probably not worth
   // it to make it so). See b/33775412.
   if (graph_->IsDebuggable() && phi->HasEquivalentPhi()) {
     return false;
   }
-  DCHECK(iset_->empty());
+
+  // Lookup phi induction cycle.
   ArenaSet<HInstruction*>* set = induction_range_.LookupCycle(phi);
   if (set != nullptr) {
     for (HInstruction* i : *set) {
@@ -1634,6 +1666,7 @@
       } else if (!i->IsRemovable()) {
         return false;
       } else if (i != phi && restrict_uses) {
+        // Deal with regular uses.
         for (const HUseListNode<HInstruction*>& use : i->GetUses()) {
           if (set->find(use.GetUser()) == set->end()) {
             return false;
@@ -1647,17 +1680,65 @@
   return false;
 }
 
-// Find: phi: Phi(init, addsub)
-//       s:   SuspendCheck
-//       c:   Condition(phi, bound)
-//       i:   If(c)
-// TODO: Find a less pattern matching approach?
-bool HLoopOptimization::TrySetSimpleLoopHeader(HBasicBlock* block) {
+bool HLoopOptimization::TrySetPhiReduction(HPhi* phi) {
   DCHECK(iset_->empty());
-  HInstruction* phi = block->GetFirstPhi();
-  if (phi != nullptr &&
-      phi->GetNext() == nullptr &&
-      TrySetPhiInduction(phi->AsPhi(), /*restrict_uses*/ false)) {
+  // Only unclassified phi cycles are candidates for reductions.
+  if (induction_range_.IsClassified(phi)) {
+    return false;
+  }
+  // Accept operations like x = x + .., provided that the phi and the reduction are
+  // used exactly once inside the loop, and by each other.
+  HInputsRef inputs = phi->GetInputs();
+  if (inputs.size() == 2) {
+    HInstruction* reduction = inputs[1];
+    if (HasReductionFormat(reduction, phi)) {
+      HLoopInformation* loop_info = phi->GetBlock()->GetLoopInformation();
+      int32_t use_count = 0;
+      bool single_use_inside_loop =
+          // Reduction update only used by phi.
+          reduction->GetUses().HasExactlyOneElement() &&
+          !reduction->HasEnvironmentUses() &&
+          // Reduction update is only use of phi inside the loop.
+          IsOnlyUsedAfterLoop(loop_info, phi, /*collect_loop_uses*/ true, &use_count) &&
+          iset_->size() == 1;
+      iset_->clear();  // leave the way you found it
+      if (single_use_inside_loop) {
+        // Link reduction back, and start recording feed value.
+        reductions_->Put(reduction, phi);
+        reductions_->Put(phi, phi->InputAt(0));
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool HLoopOptimization::TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi) {
+  // Start with empty phi induction and reductions.
+  iset_->clear();
+  reductions_->clear();
+
+  // Scan the phis to find the following (the induction structure has already
+  // been optimized, so we don't need to worry about trivial cases):
+  // (1) optional reductions in loop,
+  // (2) the main induction, used in loop control.
+  HPhi* phi = nullptr;
+  for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
+    if (TrySetPhiReduction(it.Current()->AsPhi())) {
+      continue;
+    } else if (phi == nullptr) {
+      // Found the first candidate for main induction.
+      phi = it.Current()->AsPhi();
+    } else {
+      return false;
+    }
+  }
+
+  // Then test for a typical loopheader:
+  //   s:  SuspendCheck
+  //   c:  Condition(phi, bound)
+  //   i:  If(c)
+  if (phi != nullptr && TrySetPhiInduction(phi, /*restrict_uses*/ false)) {
     HInstruction* s = block->GetFirstInstruction();
     if (s != nullptr && s->IsSuspendCheck()) {
       HInstruction* c = s->GetNext();
@@ -1669,6 +1750,7 @@
         if (i != nullptr && i->IsIf() && i->InputAt(0) == c) {
           iset_->insert(c);
           iset_->insert(s);
+          *main_phi = phi;
           return true;
         }
       }
@@ -1692,6 +1774,7 @@
 
 bool HLoopOptimization::IsUsedOutsideLoop(HLoopInformation* loop_info,
                                           HInstruction* instruction) {
+  // Deal with regular uses.
   for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) {
     if (use.GetUser()->GetBlock()->GetLoopInformation() != loop_info) {
       return true;
@@ -1704,6 +1787,7 @@
                                             HInstruction* instruction,
                                             bool collect_loop_uses,
                                             /*out*/ int32_t* use_count) {
+  // Deal with regular uses.
   for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) {
     HInstruction* user = use.GetUser();
     if (iset_->find(user) == iset_->end()) {  // not excluded?
@@ -1729,6 +1813,7 @@
   // Try to replace outside uses with the last value.
   if (induction_range_.CanGenerateLastValue(instruction)) {
     HInstruction* replacement = induction_range_.GenerateLastValue(instruction, graph_, block);
+    // Deal with regular uses.
     const HUseList<HInstruction*>& uses = instruction->GetUses();
     for (auto it = uses.begin(), end = uses.end(); it != end;) {
       HInstruction* user = it->GetUser();
@@ -1744,6 +1829,7 @@
         induction_range_.Replace(user, instruction, replacement);  // update induction
       }
     }
+    // Deal with environment uses.
     const HUseList<HEnvironment*>& env_uses = instruction->GetEnvUses();
     for (auto it = env_uses.begin(), end = env_uses.end(); it != end;) {
       HEnvironment* user = it->GetUser();
@@ -1759,7 +1845,6 @@
         }
       }
     }
-    induction_simplication_count_++;
     return true;
   }
   return false;
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index de4bd85..49be8a3 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -104,18 +104,33 @@
     bool lhs;              // def/use
   };
 
+  //
   // Loop setup and traversal.
+  //
+
   void LocalRun();
   void AddLoop(HLoopInformation* loop_info);
   void RemoveLoop(LoopNode* node);
-  void TraverseLoopsInnerToOuter(LoopNode* node);
 
+  // Traverses all loops inner to outer to perform simplifications and optimizations.
+  // Returns true if loops nested inside current loop (node) have changed.
+  bool TraverseLoopsInnerToOuter(LoopNode* node);
+
+  //
   // Optimization.
+  //
+
   void SimplifyInduction(LoopNode* node);
   void SimplifyBlocks(LoopNode* node);
-  void OptimizeInnerLoop(LoopNode* node);
 
+  // Performs optimizations specific to inner loop (empty loop removal,
+  // unrolling, vectorization). Returns true if anything changed.
+  bool OptimizeInnerLoop(LoopNode* node);
+
+  //
   // Vectorization analysis and synthesis.
+  //
+
   bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
   void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
   void GenerateNewLoop(LoopNode* node,
@@ -155,12 +170,20 @@
 
   // Vectorization heuristics.
   bool IsVectorizationProfitable(int64_t trip_count);
-  void SetPeelingCandidate(int64_t trip_count);
+  void SetPeelingCandidate(const ArrayReference* candidate, int64_t trip_count);
   uint32_t GetUnrollingFactor(HBasicBlock* block, int64_t trip_count);
 
+  //
   // Helpers.
+  //
+
   bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
-  bool TrySetSimpleLoopHeader(HBasicBlock* block);
+  bool TrySetPhiReduction(HPhi* phi);
+
+  // Detects loop header with a single induction (returned in main_phi), possibly
+  // other phis for reductions, but no other side effects. Returns true on success.
+  bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);
+
   bool IsEmptyBody(HBasicBlock* block);
   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                            HInstruction* instruction,
@@ -200,10 +223,12 @@
   // Contents reside in phase-local heap memory.
   ArenaSet<HInstruction*>* iset_;
 
-  // Counter that tracks how many induction cycles have been simplified. Useful
-  // to trigger incremental updates of induction variable analysis of outer loops
-  // when the induction of inner loops has changed.
-  uint32_t induction_simplication_count_;
+  // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
+  // (1) reductions in the loop-body are mapped back to their phi definition,
+  // (2) phi definitions are mapped to their initial value (updated during
+  //     code generation to feed the proper values into the new chain).
+  // Contents reside in phase-local heap memory.
+  ArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
 
   // Flag that tracks if any simplifications have occurred.
   bool simplified_;
diff --git a/test/661-checker-simd-reduc/expected.txt b/test/661-checker-simd-reduc/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/661-checker-simd-reduc/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/661-checker-simd-reduc/info.txt b/test/661-checker-simd-reduc/info.txt
new file mode 100644
index 0000000..4d071fb
--- /dev/null
+++ b/test/661-checker-simd-reduc/info.txt
@@ -0,0 +1 @@
+Functional tests on vectorization of the most basic reductions.
diff --git a/test/661-checker-simd-reduc/src/Main.java b/test/661-checker-simd-reduc/src/Main.java
new file mode 100644
index 0000000..741b5fa
--- /dev/null
+++ b/test/661-checker-simd-reduc/src/Main.java
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for simple integral reductions: same type for accumulator and data.
+ */
+public class Main {
+
+  static final int N = 500;
+
+  //
+  // Basic reductions in loops.
+  //
+
+  // TODO: vectorize these (second step of b/64091002 plan)
+
+  private static byte reductionByte(byte[] x) {
+    byte sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  private static short reductionShort(short[] x) {
+    short sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  private static char reductionChar(char[] x) {
+    char sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  private static int reductionInt(int[] x) {
+    int sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  private static long reductionLong(long[] x) {
+    long sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  private static byte reductionMinusByte(byte[] x) {
+    byte sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum -= x[i];
+    }
+    return sum;
+  }
+
+  private static short reductionMinusShort(short[] x) {
+    short sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum -= x[i];
+    }
+    return sum;
+  }
+
+  private static char reductionMinusChar(char[] x) {
+    char sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum -= x[i];
+    }
+    return sum;
+  }
+
+  private static int reductionMinusInt(int[] x) {
+    int sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum -= x[i];
+    }
+    return sum;
+  }
+
+  private static long reductionMinusLong(long[] x) {
+    long sum = 0;
+    for (int i = 0; i < x.length; i++) {
+      sum -= x[i];
+    }
+    return sum;
+  }
+
+  private static byte reductionMinByte(byte[] x) {
+    byte min = Byte.MAX_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      min = (byte) Math.min(min, x[i]);
+    }
+    return min;
+  }
+
+  private static short reductionMinShort(short[] x) {
+    short min = Short.MAX_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      min = (short) Math.min(min, x[i]);
+    }
+    return min;
+  }
+
+  private static char reductionMinChar(char[] x) {
+    char min = Character.MAX_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      min = (char) Math.min(min, x[i]);
+    }
+    return min;
+  }
+
+  private static int reductionMinInt(int[] x) {
+    int min = Integer.MAX_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      min = Math.min(min, x[i]);
+    }
+    return min;
+  }
+
+  private static long reductionMinLong(long[] x) {
+    long min = Long.MAX_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      min = Math.min(min, x[i]);
+    }
+    return min;
+  }
+
+  private static byte reductionMaxByte(byte[] x) {
+    byte max = Byte.MIN_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      max = (byte) Math.max(max, x[i]);
+    }
+    return max;
+  }
+
+  private static short reductionMaxShort(short[] x) {
+    short max = Short.MIN_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      max = (short) Math.max(max, x[i]);
+    }
+    return max;
+  }
+
+  private static char reductionMaxChar(char[] x) {
+    char max = Character.MIN_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      max = (char) Math.max(max, x[i]);
+    }
+    return max;
+  }
+
+  private static int reductionMaxInt(int[] x) {
+    int max = Integer.MIN_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      max = Math.max(max, x[i]);
+    }
+    return max;
+  }
+
+  private static long reductionMaxLong(long[] x) {
+    long max = Long.MIN_VALUE;
+    for (int i = 0; i < x.length; i++) {
+      max = Math.max(max, x[i]);
+    }
+    return max;
+  }
+
+  //
+  // A few special cases.
+  //
+
+  // TODO: consider unrolling
+
+  private static int reductionInt10(int[] x) {
+    int sum = 0;
+    // Amenable to complete unrolling.
+    for (int i = 10; i <= 10; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  private static int reductionMinusInt10(int[] x) {
+    int sum = 0;
+    // Amenable to complete unrolling.
+    for (int i = 10; i <= 10; i++) {
+      sum -= x[i];
+    }
+    return sum;
+  }
+
+  private static int reductionMinInt10(int[] x) {
+    int min = Integer.MAX_VALUE;
+    // Amenable to complete unrolling.
+    for (int i = 10; i <= 10; i++) {
+      min = Math.min(min, x[i]);
+    }
+    return min;
+  }
+
+  private static int reductionMaxInt10(int[] x) {
+    int max = Integer.MIN_VALUE;
+    // Amenable to complete unrolling.
+    for (int i = 10; i <= 10; i++) {
+      max = Math.max(max, x[i]);
+    }
+    return max;
+  }
+
+  //
+  // Main driver.
+  //
+
+  public static void main(String[] args) {
+    byte[] xb = new byte[N];
+    short[] xs = new short[N];
+    char[] xc = new char[N];
+    int[] xi = new int[N];
+    long[] xl = new long[N];
+    for (int i = 0, k = -17; i < N; i++, k += 3) {
+      xb[i] = (byte) k;
+      xs[i] = (short) k;
+      xc[i] = (char) k;
+      xi[i] = k;
+      xl[i] = k;
+    }
+
+    // Test various reductions in loops.
+    expectEquals(-74, reductionByte(xb));
+    expectEquals(-27466, reductionShort(xs));
+    expectEquals(38070, reductionChar(xc));
+    expectEquals(365750, reductionInt(xi));
+    expectEquals(365750L, reductionLong(xl));
+    expectEquals(74, reductionMinusByte(xb));
+    expectEquals(27466, reductionMinusShort(xs));
+    expectEquals(27466, reductionMinusChar(xc));
+    expectEquals(-365750, reductionMinusInt(xi));
+    expectEquals(-365750L, reductionMinusLong(xl));
+    expectEquals(-128, reductionMinByte(xb));
+    expectEquals(-17, reductionMinShort(xs));
+    expectEquals(1, reductionMinChar(xc));
+    expectEquals(-17, reductionMinInt(xi));
+    expectEquals(-17L, reductionMinLong(xl));
+    expectEquals(127, reductionMaxByte(xb));
+    expectEquals(1480, reductionMaxShort(xs));
+    expectEquals(65534, reductionMaxChar(xc));
+    expectEquals(1480, reductionMaxInt(xi));
+    expectEquals(1480L, reductionMaxLong(xl));
+
+    // Test special cases.
+    expectEquals(13, reductionInt10(xi));
+    expectEquals(-13, reductionMinusInt10(xi));
+    expectEquals(13, reductionMinInt10(xi));
+    expectEquals(13, reductionMaxInt10(xi));
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}