Revert "Revert "Avoid scratch register exhaustion during ARM64 stack slot moves.""

This reverts commit 122ceb492f37e97d022cba2221a87368f1847f5f.

Test: m test-art-target on ARM64, with and without read barriers/heap poisoning.
Bug: 32545705
Change-Id: I4b447d762082eea8edfabeb070317d274e2f5bd0
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 13616db..5c33fe1 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1533,8 +1533,17 @@
       DCHECK(source.IsStackSlot() || source.IsDoubleStackSlot());
       DCHECK(source.IsDoubleStackSlot() == destination.IsDoubleStackSlot());
       UseScratchRegisterScope temps(GetVIXLAssembler());
-      // There is generally less pressure on FP registers.
-      FPRegister temp = destination.IsDoubleStackSlot() ? temps.AcquireD() : temps.AcquireS();
+      // Use any scratch register (a core or a floating-point one)
+      // from VIXL scratch register pools as a temporary.
+      //
+      // We used to only use the FP scratch register pool, but in some
+      // rare cases the only register from this pool (D31) would
+      // already be used (e.g. within a ParallelMove instruction, when
+      // a move is blocked by a another move requiring a scratch FP
+      // register, which would reserve D31). To prevent this issue, we
+      // ask for a scratch register of any type (core or FP).
+      CPURegister temp =
+          temps.AcquireCPURegisterOfSize(destination.IsDoubleStackSlot() ? kXRegSize : kWRegSize);
       __ Ldr(temp, StackOperandFrom(source));
       __ Str(temp, StackOperandFrom(destination));
     }