Implemented a stack symbol table ordering/packing optimization to improve data locality and reduce the code size that results from SP/FP offset encoding.
Differential Revision: http://reviews.llvm.org/D15393
llvm-svn: 260917
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 8a4df42..14a2d08 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -707,8 +707,10 @@
Offset, MaxAlign, Skew);
}
- // Then assign frame offsets to stack objects that are not used to spill
- // callee saved registers.
+ SmallVector<int, 8> ObjectsToAllocate;
+
+ // Then prepare to assign frame offsets to stack objects that are not used to
+ // spill callee saved registers.
for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
if (MFI->isObjectPreAllocated(i) &&
MFI->getUseLocalStackAllocationBlock())
@@ -724,8 +726,17 @@
if (ProtectedObjs.count(i))
continue;
- AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
+ // Add the objects that we need to allocate to our working set.
+ ObjectsToAllocate.push_back(i);
}
+ // Give the targets a chance to order the objects the way they like it.
+ if (Fn.getTarget().getOptLevel() != CodeGenOpt::None &&
+ Fn.getTarget().Options.StackSymbolOrdering)
+ TFI.orderFrameObjects(Fn, ObjectsToAllocate);
+
+ // Now walk the objects and actually assign base offsets to them.
+ for (auto &Object : ObjectsToAllocate)
+ AdjustStackOffset(MFI, Object, StackGrowsDown, Offset, MaxAlign, Skew);
// Make sure the special register scavenging spill slot is closest to the
// stack pointer.
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index c394383..6e48654 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2669,6 +2669,148 @@
return MBBI;
}
+namespace {
+// Struct used by orderFrameObjects to help sort the stack objects.
+// Entries are value-initialized (IsValid=false) when the working array is
+// built, and only the objects we intend to allocate are marked valid.
+struct X86FrameSortingObject {
+  bool IsValid = false; // true if we care about this Object.
+  unsigned ObjectIndex = 0; // Index of Object into MFI list.
+  unsigned ObjectSize = 0; // Size of Object in bytes.
+                           // (Variable-sized objects are recorded with a
+                           // nominal size of 4; see orderFrameObjects.)
+  unsigned ObjectAlignment = 1; // Alignment of Object in bytes.
+  unsigned ObjectNumUses = 0; // Object static number of uses.
+};
+
+// The comparison function we use for std::sort to order our local
+// stack symbols. The current algorithm is to use an estimated
+// "density". This takes into consideration the size and number of
+// uses each object has in order to roughly minimize code size.
+// So, for example, an object of size 16B that is referenced 5 times
+// will get higher priority than 4 4B objects referenced 1 time each.
+// It's not perfect and we may be able to squeeze a few more bytes out of
+// it (for example : 0(esp) requires fewer bytes, symbols allocated at the
+// fringe end can have special consideration, given their size is less
+// important, etc.), but the algorithmic complexity grows too much to be
+// worth the extra gains we get. This gets us pretty close.
+// The final order leaves us with objects with highest priority going
+// at the end of our list.
+struct X86FrameSortingComparator {
+  // Returns true when A should sort before B (i.e. A has LOWER priority,
+  // since the highest-priority objects end up at the back of the list).
+  // NOTE(review): operator() could be marked const; the comparator is
+  // copied by value into std::stable_sort either way.
+  inline bool operator()(const X86FrameSortingObject &A,
+                         const X86FrameSortingObject &B) {
+    uint64_t DensityAScaled, DensityBScaled;
+
+    // For consistency in our comparison, all invalid objects are placed
+    // at the end. This also allows us to stop walking when we hit the
+    // first invalid item after it's all sorted.
+    // When both A and B are invalid this returns false for either
+    // argument order, so the two compare "equivalent" — the strict weak
+    // ordering std::stable_sort requires is preserved.
+    if (!A.IsValid)
+      return false;
+    if (!B.IsValid)
+      return true;
+
+    // The density is calculated by doing :
+    //     (double)DensityA = A.ObjectNumUses / A.ObjectSize
+    //     (double)DensityB = B.ObjectNumUses / B.ObjectSize
+    // Since this approach may cause inconsistencies in
+    // the floating point <, >, == comparisons, depending on the floating
+    // point model with which the compiler was built, we're going
+    // to scale both sides by multiplying with
+    // A.ObjectSize * B.ObjectSize. This ends up factoring away
+    // the division and, with it, the need for any floating point
+    // arithmetic.
+    DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
+      static_cast<uint64_t>(B.ObjectSize);
+    DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
+      static_cast<uint64_t>(A.ObjectSize);
+
+    // If the two densities are equal, prioritize highest alignment
+    // objects. This allows for similar alignment objects
+    // to be packed together (given the same density).
+    // There's room for improvement here, also, since we can pack
+    // similar alignment (different density) objects next to each
+    // other to save padding. This will also require further
+    // complexity/iterations, and the overall gain isn't worth it,
+    // in general. Something to keep in mind, though.
+    if (DensityAScaled == DensityBScaled)
+      return A.ObjectAlignment < B.ObjectAlignment;
+
+    return DensityAScaled < DensityBScaled;
+  }
+};
+} // namespace
+
+// Order the symbols in the local stack.
+// We want to place the local stack objects in some sort of sensible order.
+// The heuristic we use is to try and pack them according to static number
+// of uses and size of object in order to minimize code size.
+// On return, ObjectsToAllocate holds the same frame indices, reordered so
+// that PEI assigns the most-used objects the smallest SP/FP offsets.
+void X86FrameLowering::orderFrameObjects(
+    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Don't waste time if there's nothing to do.
+  if (ObjectsToAllocate.empty())
+    return;
+
+  // Create an array of all MFI objects. We won't need all of these
+  // objects, but we're going to create a full array of them to make
+  // it easier to index into when we're counting "uses" down below.
+  // We want to be able to easily/cheaply access an object by simply
+  // indexing into it, instead of having to search for it every time.
+  // std::vector value-initializes each entry, so IsValid starts false
+  // (via the in-class member initializers) for objects we don't touch.
+  std::vector<X86FrameSortingObject> SortingObjects(MFI->getObjectIndexEnd());
+
+  // Walk the objects we care about and mark them as such in our working
+  // struct.
+  for (auto &Obj : ObjectsToAllocate) {
+    SortingObjects[Obj].IsValid = true;
+    SortingObjects[Obj].ObjectIndex = Obj;
+    SortingObjects[Obj].ObjectAlignment = MFI->getObjectAlignment(Obj);
+    // Set the size.
+    int ObjectSize = MFI->getObjectSize(Obj);
+    if (ObjectSize == 0)
+      // Variable size. Just use 4.
+      SortingObjects[Obj].ObjectSize = 4;
+    else
+      SortingObjects[Obj].ObjectSize = ObjectSize;
+  }
+
+  // Count the number of uses for each object.
+  // This is a static count over all machine instructions' frame-index
+  // operands; no profile/frequency weighting is applied.
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      for (const MachineOperand &MO : MI.operands()) {
+        // Check to see if it's a local stack symbol.
+        if (!MO.isFI())
+          continue;
+        int Index = MO.getIndex();
+        // Check to see if it falls within our range, and is tagged
+        // to require ordering.
+        if (Index >= 0 && Index < MFI->getObjectIndexEnd() &&
+            SortingObjects[Index].IsValid)
+          SortingObjects[Index].ObjectNumUses++;
+      }
+    }
+  }
+
+  // Sort the objects using X86FrameSortingAlgorithm (see its comment for
+  // info).
+  // stable_sort keeps the original relative order of equal-priority
+  // objects, so the output is deterministic across runs.
+  std::stable_sort(SortingObjects.begin(), SortingObjects.end(),
+                   X86FrameSortingComparator());
+
+  // Now modify the original list to represent the final order that
+  // we want. The order will depend on whether we're going to access them
+  // from the stack pointer or the frame pointer. For SP, the list should
+  // end up with the END containing objects that we want with smaller offsets.
+  // For FP, it should be flipped.
+  // Every index originally in ObjectsToAllocate was marked valid above, so
+  // this writes back exactly ObjectsToAllocate.size() entries in place.
+  int i = 0;
+  for (auto &Obj : SortingObjects) {
+    // All invalid items are sorted at the end, so it's safe to stop.
+    if (!Obj.IsValid)
+      break;
+    ObjectsToAllocate[i++] = Obj.ObjectIndex;
+  }
+
+  // Flip it if we're accessing off of the FP.
+  // NOTE(review): the flip is skipped when the stack needs realignment,
+  // presumably because realigned frames still address locals via SP even
+  // when an FP exists — confirm against X86 frame lowering conventions.
+  if (!TRI->needsStackRealignment(MF) && hasFP(MF))
+    std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
+}
+
+
unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
// RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
unsigned Offset = 16;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index 3ab41b4..e2a488c 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -137,6 +137,13 @@
/// Returns true if the target will correctly handle shrink wrapping.
bool enableShrinkWrapping(const MachineFunction &MF) const override;
+ /// Order the symbols in the local stack.
+ /// We want to place the local stack objects in some sort of sensible order.
+ /// The heuristic we use is to try and pack them according to static number
+ /// of uses and size in order to minimize code size.
+ void orderFrameObjects(const MachineFunction &MF,
+ SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
/// convertArgMovsToPushes - This method tries to convert a call sequence
/// that uses sub and mov instructions to put the argument onto the stack
/// into a series of pushes.