Opt compiler: Implement parallel move resolver without using swap.

The algorithm of ParallelMoveResolverNoSwap() is almost the same as
ParallelMoveResolverWithSwap(), except for the way circular dependencies
are resolved: NoSwap() uses an additional scratch register instead of a
swap. For example, (0->1) (1->2) (2->0) will be performed as
(2->scratch) (1->2) (0->1) (scratch->0).
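For comparison, WithSwap() resolves the same cycle with two swaps, e.g.
(0<->1) followed by (0<->2), and on architectures without a swap
instruction each swap itself takes three moves through a temporary.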

On architectures without swap register support, NoSwap() reduces the
number of emitted moves from 3x(N-1) to N+1 for a circular dependency
of N moves.
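For the three-move cycle above, that is 3x(3-1) = 6 moves versus
3+1 = 4.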

In addition, the NoSwap() algorithm does not depend on architecture
register layout information, so it can support register pairs on arm32
and X/W, D/S registers on arm64 without further modification.
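For example, WithSwap() needs layout-specific handling for register
pairs (see the LowOf()/HighOf() helpers in this file), while NoSwap()
only needs to know the size of each move (Is64BitMove()).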

Change-Id: Idf56bd5469bb78c0e339e43ab16387428a082318
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index ad92ca5..54ea6f1 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -17,11 +17,23 @@
 
 #include "parallel_move_resolver.h"
 #include "nodes.h"
-#include "locations.h"
 
 namespace art {
 
-void ParallelMoveResolver::EmitNativeCode(HParallelMove* parallel_move) {
+void ParallelMoveResolver::BuildInitialMoveList(HParallelMove* parallel_move) {
+  // Perform a linear sweep of the moves to add them to the initial list of
+  // moves to perform, ignoring any move that is redundant (the source is
+  // the same as the destination, the destination is ignored and
+  // unallocated, or the move was already eliminated).
+  for (size_t i = 0; i < parallel_move->NumMoves(); ++i) {
+    MoveOperands* move = parallel_move->MoveOperandsAt(i);
+    if (!move->IsRedundant()) {
+      moves_.Add(move);
+    }
+  }
+}
+
+void ParallelMoveResolverWithSwap::EmitNativeCode(HParallelMove* parallel_move) {
   DCHECK(moves_.IsEmpty());
   // Build up a worklist of moves.
   BuildInitialMoveList(parallel_move);
@@ -50,20 +62,6 @@
   moves_.Reset();
 }
 
-
-void ParallelMoveResolver::BuildInitialMoveList(HParallelMove* parallel_move) {
-  // Perform a linear sweep of the moves to add them to the initial list of
-  // moves to perform, ignoring any move that is redundant (the source is
-  // the same as the destination, the destination is ignored and
-  // unallocated, or the move was already eliminated).
-  for (size_t i = 0; i < parallel_move->NumMoves(); ++i) {
-    MoveOperands* move = parallel_move->MoveOperandsAt(i);
-    if (!move->IsRedundant()) {
-      moves_.Add(move);
-    }
-  }
-}
-
 Location LowOf(Location location) {
   if (location.IsRegisterPair()) {
     return Location::RegisterLocation(location.low());
@@ -103,7 +101,7 @@
   }
 }
 
-MoveOperands* ParallelMoveResolver::PerformMove(size_t index) {
+MoveOperands* ParallelMoveResolverWithSwap::PerformMove(size_t index) {
   // Each call to this function performs a move and deletes it from the move
   // graph.  We first recursively perform any move blocking this one.  We
   // mark a move as "pending" on entry to PerformMove in order to detect
@@ -229,7 +227,7 @@
   }
 }
 
-bool ParallelMoveResolver::IsScratchLocation(Location loc) {
+bool ParallelMoveResolverWithSwap::IsScratchLocation(Location loc) {
   for (size_t i = 0; i < moves_.Size(); ++i) {
     if (moves_.Get(i)->Blocks(loc)) {
       return false;
@@ -245,10 +243,10 @@
   return false;
 }
 
-int ParallelMoveResolver::AllocateScratchRegister(int blocked,
-                                                  int register_count,
-                                                  int if_scratch,
-                                                  bool* spilled) {
+int ParallelMoveResolverWithSwap::AllocateScratchRegister(int blocked,
+                                                          int register_count,
+                                                          int if_scratch,
+                                                          bool* spilled) {
   DCHECK_NE(blocked, if_scratch);
   int scratch = -1;
   for (int reg = 0; reg < register_count; ++reg) {
@@ -269,8 +267,8 @@
 }
 
 
-ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
-    ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers)
+ParallelMoveResolverWithSwap::ScratchRegisterScope::ScratchRegisterScope(
+    ParallelMoveResolverWithSwap* resolver, int blocked, int if_scratch, int number_of_registers)
     : resolver_(resolver),
       reg_(kNoRegister),
       spilled_(false) {
@@ -282,10 +280,285 @@
 }
 
 
-ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() {
+ParallelMoveResolverWithSwap::ScratchRegisterScope::~ScratchRegisterScope() {
   if (spilled_) {
     resolver_->RestoreScratch(reg_);
   }
 }
 
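+// Resolve the parallel move in three passes: first perform all moves with
+// non-constant sources (breaking cycles with scratch locations), then the
+// moves with constant sources and register destinations, and finally the
+// remaining constant-to-stack-slot moves.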
+void ParallelMoveResolverNoSwap::EmitNativeCode(HParallelMove* parallel_move) {
+  DCHECK_EQ(GetNumberOfPendingMoves(), 0u);
+  DCHECK(moves_.IsEmpty());
+  DCHECK(scratches_.IsEmpty());
+
+  // Backend-dependent initialization.
+  PrepareForEmitNativeCode();
+
+  // Build up a worklist of moves.
+  BuildInitialMoveList(parallel_move);
+
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    const MoveOperands& move = *moves_.Get(i);
+    // Skip constants to perform them last. They don't block other moves and
+    // skipping such moves with register destinations keeps those registers
+    // free for the whole algorithm.
+    if (!move.IsEliminated() && !move.GetSource().IsConstant()) {
+      PerformMove(i);
+    }
+  }
+
+  // Perform the moves with constant sources and register destinations with UpdateMoveSource()
+  // to reduce the number of literal loads. Stack destinations are skipped since we won't benefit
+  // from changing the constant sources to stack locations.
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    MoveOperands* move = moves_.Get(i);
+    Location destination = move->GetDestination();
+    if (!move->IsEliminated() && !destination.IsStackSlot() && !destination.IsDoubleStackSlot()) {
+      Location source = move->GetSource();
+      EmitMove(i);
+      move->Eliminate();
+      // This may introduce an additional instruction dependency, but it reduces
+      // the number of moves and possible literal loads. For example,
+      // Original moves:
+      //   1234.5678 -> D0
+      //   1234.5678 -> D1
+      // Updated moves:
+      //   1234.5678 -> D0
+      //   D0 -> D1
+      UpdateMoveSource(source, destination);
+    }
+  }
+
+  // Perform the rest of the moves.
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    MoveOperands* move = moves_.Get(i);
+    if (!move->IsEliminated()) {
+      EmitMove(i);
+      move->Eliminate();
+    }
+  }
+
+  // All pending moves that we have added to resolve cycles should have been performed.
+  DCHECK_EQ(GetNumberOfPendingMoves(), 0u);
+
+  // Backend-dependent cleanup.
+  FinishEmitNativeCode();
+
+  moves_.Reset();
+  scratches_.Reset();
+}
+
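+// Look for a location of the given kind that can be reused as a scratch:
+// either an already-allocated scratch location, or the destination of a move,
+// as long as it is not blocked by any remaining or pending move. Returns
+// NoLocation() when nothing suitable is found, in which case the caller is
+// expected to allocate a fresh location.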
+Location ParallelMoveResolverNoSwap::GetScratchLocation(Location::Kind kind) {
+  for (size_t i = 0; i < scratches_.Size(); ++i) {
+    Location loc = scratches_.Get(i);
+    if (loc.GetKind() == kind && !IsBlockedByMoves(loc)) {
+      return loc;
+    }
+  }
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    Location loc = moves_.Get(i)->GetDestination();
+    if (loc.GetKind() == kind && !IsBlockedByMoves(loc)) {
+      return loc;
+    }
+  }
+  return Location::NoLocation();
+}
+
+void ParallelMoveResolverNoSwap::AddScratchLocation(Location loc) {
+  if (kIsDebugBuild) {
+    for (size_t i = 0; i < scratches_.Size(); ++i) {
+      DCHECK(!loc.Equals(scratches_.Get(i)));
+    }
+  }
+  scratches_.Add(loc);
+}
+
+void ParallelMoveResolverNoSwap::RemoveScratchLocation(Location loc) {
+  DCHECK(!IsBlockedByMoves(loc));
+  for (size_t i = 0; i < scratches_.Size(); ++i) {
+    if (loc.Equals(scratches_.Get(i))) {
+      scratches_.DeleteAt(i);
+      break;
+    }
+  }
+}
+
+void ParallelMoveResolverNoSwap::PerformMove(size_t index) {
+  // Each call to this function performs a move and deletes it from the move
+  // graph. We first recursively perform any move blocking this one. We mark
+  // a move as "pending" on entry to PerformMove in order to detect cycles
+  // in the move graph. We use a scratch location to resolve cycles, and
+  // additional pending moves might be added. After a move has been performed,
+  // we update the source operands in the move graph to reduce dependencies in
+  // the graph.
+
+  MoveOperands* move = moves_.Get(index);
+  DCHECK(!move->IsPending());
+  DCHECK(!move->IsEliminated());
+  if (move->IsRedundant()) {
+    // Previous operations on the list of moves have caused this particular move
+    // to become a no-op, so we can safely eliminate it. Consider for example
+    // (0 -> 1) (1 -> 0) (1 -> 2). There is a cycle (0 -> 1) (1 -> 0) that we will
+    // resolve as (1 -> scratch) (0 -> 1) (scratch -> 0). If, by chance, '2' is
+    // used as the scratch location, the move (1 -> 2) will occur while resolving
+    // the cycle. When that move is emitted, the code will update moves with a '1'
+    // as their source to use '2' instead (see `UpdateMoveSource()`). In our example
+    // the initial move (1 -> 2) would then become the no-op (2 -> 2) that can be
+    // eliminated here.
+    move->Eliminate();
+    return;
+  }
+
+  // Clear this move's destination to indicate a pending move. The actual
+  // destination is saved in a stack-allocated local. Recursion may allow
+  // multiple moves to be pending.
+  DCHECK(!move->GetSource().IsInvalid());
+  Location destination = move->MarkPending();
+
+  // Perform a depth-first traversal of the move graph to resolve
+  // dependencies. Any unperformed, unpending move with a source the same
+  // as this one's destination blocks this one, so we recursively perform all
+  // such moves.
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    const MoveOperands& other_move = *moves_.Get(i);
+    if (other_move.Blocks(destination) && !other_move.IsPending()) {
+      PerformMove(i);
+    }
+  }
+
+  // We are about to resolve this move and don't need it marked as
+  // pending, so restore its destination.
+  move->ClearPending(destination);
+
+  // No one else should write to the move destination while it is pending.
+  DCHECK(!move->IsRedundant());
+
+  Location source = move->GetSource();
+  // The move may be blocked on several pending moves, in case we have a cycle.
+  if (IsBlockedByMoves(destination)) {
+    // For a cycle like (A -> B) (B -> C) (C -> A), we change it to the following
+    // sequence:
+    // (C -> scratch)     # Emit right now.
+    // (A -> B) (B -> C)  # Unblocked.
+    // (scratch -> A)     # Add to pending_moves_, blocked by (A -> B).
+    Location::Kind kind = source.GetKind();
+    DCHECK_NE(kind, Location::kConstant);
+    Location scratch = AllocateScratchLocationFor(kind);
+    // We only care about the move size.
+    Primitive::Type type = move->Is64BitMove() ? Primitive::kPrimLong : Primitive::kPrimInt;
+    // Perform (C -> scratch)
+    move->SetDestination(scratch);
+    EmitMove(index);
+    move->Eliminate();
+    UpdateMoveSource(source, scratch);
+    // Add (scratch -> A).
+    AddPendingMove(scratch, destination, type);
+  } else {
+    // This move is not blocked.
+    EmitMove(index);
+    move->Eliminate();
+    UpdateMoveSource(source, destination);
+  }
+
+  // Moves in the pending list should not block any other moves. But performing
+  // unblocked moves in the pending list can free scratch registers, so we do this
+  // as early as possible.
+  MoveOperands* pending_move;
+  while ((pending_move = GetUnblockedPendingMove(source)) != nullptr) {
+    Location pending_source = pending_move->GetSource();
+    Location pending_destination = pending_move->GetDestination();
+    // We do not depend on the pending move index, so just delete the move instead
+    // of eliminating it to keep the pending list cleaner.
+    DeletePendingMove(pending_move);
+    move->SetSource(pending_source);
+    move->SetDestination(pending_destination);
+    EmitMove(index);
+    move->Eliminate();
+    UpdateMoveSource(pending_source, pending_destination);
+    // Free any unblocked locations in the scratch location list.
+    for (size_t i = 0; i < scratches_.Size(); ++i) {
+      Location scratch = scratches_.Get(i);
+      // Only a scratch location overlapping with the performed move's source can be unblocked.
+      if (scratch.OverlapsWith(pending_source) && !IsBlockedByMoves(scratch)) {
+        FreeScratchLocation(pending_source);
+      }
+    }
+  }
+}
+
+void ParallelMoveResolverNoSwap::UpdateMoveSource(Location from, Location to) {
+  // This function is used to reduce the dependencies in the graph after
+  // (from -> to) has been performed. Since we ensure there is no move with the same
+  // destination, (to -> X) cannot be blocked while (from -> X) might still be
+  // blocked. Consider for example the moves (0 -> 1) (1 -> 2) (1 -> 3). After
+  // (1 -> 2) has been performed, the moves left are (0 -> 1) and (1 -> 3). There is
+  // a dependency between the two. If we update the source location from 1 to 2, we
+  // will get (0 -> 1) and (2 -> 3). There is no dependency between the two.
+  //
+  // This is not something we must do, but we can use fewer scratch locations with
+  // this trick. For example, we can avoid using additional scratch locations for
+  // moves (0 -> 1), (1 -> 2), (1 -> 0).
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    MoveOperands* move = moves_.Get(i);
+    if (move->GetSource().Equals(from)) {
+      move->SetSource(to);
+    }
+  }
+}
+
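+// Pending moves are the (scratch -> X) moves added while breaking cycles.
+// They do not belong to the original HParallelMove, so they have no
+// associated instruction (hence the `nullptr` below).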
+void ParallelMoveResolverNoSwap::AddPendingMove(Location source,
+    Location destination, Primitive::Type type) {
+  pending_moves_.Add(new (allocator_) MoveOperands(source, destination, type, nullptr));
+}
+
+void ParallelMoveResolverNoSwap::DeletePendingMove(MoveOperands* move) {
+  pending_moves_.Delete(move);
+}
+
+MoveOperands* ParallelMoveResolverNoSwap::GetUnblockedPendingMove(Location loc) {
+  for (size_t i = 0; i < pending_moves_.Size(); ++i) {
+    MoveOperands* move = pending_moves_.Get(i);
+    Location destination = move->GetDestination();
+    // Only a move whose destination overlaps with the input `loc` can be unblocked.
+    if (destination.OverlapsWith(loc) && !IsBlockedByMoves(destination)) {
+      return move;
+    }
+  }
+  return nullptr;
+}
+
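+// A location is blocked as long as some remaining or pending move still
+// reads from it, i.e. Blocks() returns true for that move.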
+bool ParallelMoveResolverNoSwap::IsBlockedByMoves(Location loc) {
+  for (size_t i = 0; i < pending_moves_.Size(); ++i) {
+    if (pending_moves_.Get(i)->Blocks(loc)) {
+      return true;
+    }
+  }
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    if (moves_.Get(i)->Blocks(loc)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// So far it is only used for debugging purposes to make sure all pending moves
+// have been performed.
+size_t ParallelMoveResolverNoSwap::GetNumberOfPendingMoves() {
+  return pending_moves_.Size();
+}
+
 }  // namespace art