[AArch64] Fix halfword load merging for big-endian targets

For big-endian targets, when we merge two halfword loads into a word load, the
order of the halfwords in the loaded value is reversed compared to
little-endian, so the load-store optimiser needs to swap the destination
registers.

This does not affect merging of two word loads, as we use ldp, which treats the
memory as two separate 32-bit words.

llvm-svn: 252597
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 6ef4c26..ffe6ab2 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -84,6 +84,7 @@
 
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  const AArch64Subtarget *Subtarget;
 
   // Scan the instructions looking for a load/store that can be combined
   // with the current instruction into a load/store pair.
@@ -537,6 +538,10 @@
     if (!IsUnscaled)
       OffsetImm /= 2;
     MachineInstr *RtNewDest = MergeForward ? I : Paired;
+    // When merging small (< 32 bit) loads for big-endian targets, the order of
+    // the component parts gets swapped.
+    if (!Subtarget->isLittleEndian())
+      std::swap(RtMI, Rt2MI);
     // Construct the new load instruction.
     // FIXME: currently we support only halfword unsigned load. We need to
     // handle byte type, signed, and store instructions as well.
@@ -560,7 +565,7 @@
     DEBUG((NewMemMI)->print(dbgs()));
 
     MachineInstr *ExtDestMI = MergeForward ? Paired : I;
-    if (ExtDestMI == Rt2MI) {
+    if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
       // Create the bitfield extract for high half.
       BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                           TII->get(AArch64::UBFMWri))
@@ -1388,8 +1393,9 @@
 }
 
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
-  TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
-  TRI = Fn.getSubtarget().getRegisterInfo();
+  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+  TRI = Subtarget->getRegisterInfo();
 
   bool Modified = false;
   bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);