Add a new implementation of reduceOpsTaskSplitting

Currently this doesn't actually handle going over the memory
budget, but it does carve out space to do that later. This algorithm
can't handle everything yet, but since it is disabled I want to get
it landed now so we can iterate on it.

Bug: skia:10877
Change-Id: I37942172345e8cfd6fc2c591a3788a10652377da
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/345168
Commit-Queue: Adlai Holler <adlai@google.com>
Reviewed-by: Robert Phillips <robertphillips@google.com>
Reviewed-by: Brian Salomon <bsalomon@google.com>
Auto-Submit: Adlai Holler <adlai@google.com>
diff --git a/gn/gpu.gni b/gn/gpu.gni
index b1586c4..46d3147 100644
--- a/gn/gpu.gni
+++ b/gn/gpu.gni
@@ -234,6 +234,7 @@
   "$_src/gpu/GrSwizzle.cpp",
   "$_src/gpu/GrSwizzle.h",
   "$_src/gpu/GrTBlockList.h",
+  "$_src/gpu/GrTCluster.h",
   "$_src/gpu/GrTTopoSort.h",
   "$_src/gpu/GrTestUtils.cpp",
   "$_src/gpu/GrTestUtils.h",
diff --git a/gn/tests.gni b/gn/tests.gni
index 318e223..71a443f 100644
--- a/gn/tests.gni
+++ b/gn/tests.gni
@@ -118,6 +118,7 @@
   "$_tests/GrSubmittedFlushTest.cpp",
   "$_tests/GrSurfaceTest.cpp",
   "$_tests/GrTBlockListTest.cpp",
+  "$_tests/GrTClusterTest.cpp",
   "$_tests/GrTextBlobTest.cpp",
   "$_tests/GrTextureMipMapInvalidationTest.cpp",
   "$_tests/GrThreadSafeCacheTest.cpp",
diff --git a/include/gpu/GrContextOptions.h b/include/gpu/GrContextOptions.h
index 25fbf4f..ea59eca 100644
--- a/include/gpu/GrContextOptions.h
+++ b/include/gpu/GrContextOptions.h
@@ -175,8 +175,6 @@
 
     /**
      * Experimental: Allow Ganesh to more aggressively reorder operations.
-     * Eventually this will just be what is done and will not be optional.
-     * Note: This option currently has no effect while we update its implementation.
      */
     Enable fReduceOpsTaskSplitting = Enable::kDefault;
 
diff --git a/src/gpu/GrDrawingManager.cpp b/src/gpu/GrDrawingManager.cpp
index 4f8180b..8ca4553 100644
--- a/src/gpu/GrDrawingManager.cpp
+++ b/src/gpu/GrDrawingManager.cpp
@@ -15,6 +15,7 @@
 #include "include/gpu/GrDirectContext.h"
 #include "include/gpu/GrRecordingContext.h"
 #include "src/core/SkDeferredDisplayListPriv.h"
+#include "src/core/SkTInternalLList.h"
 #include "src/gpu/GrAuditTrail.h"
 #include "src/gpu/GrClientMappedBufferManager.h"
 #include "src/gpu/GrCopyRenderTask.h"
@@ -32,6 +33,7 @@
 #include "src/gpu/GrSurfaceContext.h"
 #include "src/gpu/GrSurfaceDrawContext.h"
 #include "src/gpu/GrSurfaceProxyPriv.h"
+#include "src/gpu/GrTCluster.h"
 #include "src/gpu/GrTTopoSort.h"
 #include "src/gpu/GrTexture.h"
 #include "src/gpu/GrTextureProxy.h"
@@ -135,6 +137,11 @@
     fActiveOpsTask = nullptr;
 
     this->sortTasks();
+
+    if (fReduceOpsTaskSplitting) {
+        this->reorderTasks();
+    }
+
     if (!fCpuBufferCache) {
         // We cache more buffers when the backend is using client side arrays. Otherwise, we
         // expect each pool will use a CPU buffer as a staging buffer before uploading to a GPU
@@ -415,6 +422,32 @@
 #endif
 }
 
+// Reorder the array to match the llist without reffing & unreffing sk_sp's.
+// Both args must contain the same objects.
+// This is basically a shim because clustering uses LList but the rest of drawmgr uses array.
+template <typename T>
+static void reorder_array_by_llist(const SkTInternalLList<T>& llist, SkTArray<sk_sp<T>>* array) {
+    int i = 0;
+    for (T* t : llist) {
+        // Release the pointer that used to live here so it doesn't get unreffed.
+        [[maybe_unused]] T* old = array->at(i).release();
+        array->at(i++).reset(t);
+    }
+    SkASSERT(i == array->count());
+}
+
+void GrDrawingManager::reorderTasks() {
+    SkASSERT(fReduceOpsTaskSplitting);
+    SkTInternalLList<GrRenderTask> llist;
+    bool clustered = GrTCluster<GrRenderTask, GrRenderTask::ClusterTraits>(fDAG, &llist);
+    if (!clustered) {
+        return;
+    }
+    // TODO: Handle case where proposed order would blow our memory budget.
+    // Such cases are currently pathological, so we could just return here and keep current order.
+    reorder_array_by_llist(llist, &fDAG);
+}
+
 void GrDrawingManager::closeAllTasks() {
     const GrCaps& caps = *fContext->priv().caps();
     for (auto& task : fDAG) {
diff --git a/src/gpu/GrDrawingManager.h b/src/gpu/GrDrawingManager.h
index d199ee0..72c6c15 100644
--- a/src/gpu/GrDrawingManager.h
+++ b/src/gpu/GrDrawingManager.h
@@ -135,6 +135,7 @@
     void removeRenderTasks(int startIndex, int stopIndex);
 
     void sortTasks();
+    void reorderTasks();
 
     void closeAllTasks();
 
diff --git a/src/gpu/GrRenderTask.cpp b/src/gpu/GrRenderTask.cpp
index 460454a..b1a4b15 100644
--- a/src/gpu/GrRenderTask.cpp
+++ b/src/gpu/GrRenderTask.cpp
@@ -229,7 +229,7 @@
 }
 
 #ifdef SK_DEBUG
-bool GrRenderTask::isDependedent(const GrRenderTask* dependent) const {
+bool GrRenderTask::isDependent(const GrRenderTask* dependent) const {
     for (int i = 0; i < fDependents.count(); ++i) {
         if (fDependents[i] == dependent) {
             return true;
@@ -243,7 +243,7 @@
     // TODO: check for loops and duplicates
 
     for (int i = 0; i < fDependencies.count(); ++i) {
-        SkASSERT(fDependencies[i]->isDependedent(this));
+        SkASSERT(fDependencies[i]->isDependent(this));
     }
 }
 #endif
diff --git a/src/gpu/GrRenderTask.h b/src/gpu/GrRenderTask.h
index 08c6253..4b23b71 100644
--- a/src/gpu/GrRenderTask.h
+++ b/src/gpu/GrRenderTask.h
@@ -10,6 +10,7 @@
 
 #include "include/core/SkRefCnt.h"
 #include "include/private/SkTArray.h"
+#include "src/core/SkTInternalLList.h"
 #include "src/gpu/GrSurfaceProxyView.h"
 #include "src/gpu/GrTextureProxy.h"
 #include "src/gpu/GrTextureResolveManager.h"
@@ -122,6 +123,9 @@
     // it is required)?
     bool isInstantiated() const;
 
+    // Used by GrTCluster.
+    SK_DECLARE_INTERNAL_LLIST_INTERFACE(GrRenderTask);
+
 protected:
     SkDEBUGCODE(bool deferredProxiesAreInstantiated() const;)
 
@@ -189,7 +193,7 @@
 
     void addDependency(GrRenderTask* dependedOn);
     void addDependent(GrRenderTask* dependent);
-    SkDEBUGCODE(bool isDependedent(const GrRenderTask* dependent) const;)
+    SkDEBUGCODE(bool isDependent(const GrRenderTask* dependent) const;)
     SkDEBUGCODE(void validate() const;)
     void closeThoseWhoDependOnMe(const GrCaps&);
 
@@ -223,6 +227,23 @@
         }
     };
 
+    struct ClusterTraits {
+        static int NumTargets(GrRenderTask* renderTask) {
+            return renderTask->numTargets();
+        }
+        static uint32_t GetTarget(GrRenderTask* renderTask, int index) {
+            return renderTask->target(index).proxy()->uniqueID().asUInt();
+        }
+        static int NumDependencies(const GrRenderTask* renderTask) {
+            return renderTask->fDependencies.count();
+        }
+        static GrRenderTask* Dependency(GrRenderTask* renderTask, int index) {
+            return renderTask->fDependencies[index];
+        }
+        static uint32_t GetID(GrRenderTask* renderTask) {
+            return renderTask->uniqueID();
+        }
+    };
 
     virtual void onPrePrepare(GrRecordingContext*) {} // Only the GrOpsTask currently overrides this
     virtual void onPrepare(GrOpFlushState*) {} // Only GrOpsTask and GrDDLTask override this virtual
diff --git a/src/gpu/GrTCluster.h b/src/gpu/GrTCluster.h
new file mode 100644
index 0000000..2d7d5cf
--- /dev/null
+++ b/src/gpu/GrTCluster.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrTCluster_DEFINED
+#define GrTCluster_DEFINED
+
+#include "include/core/SkRefCnt.h"
+#include "include/private/SkTHash.h"
+#include "src/core/SkScopeExit.h"
+#include "src/core/SkSpan.h"
+#include "src/core/SkTInternalLList.h"
+
+// Uncomment to get lots of logging.
+#define CLUSTER_DEBUGF(...) //SkDebugf(__VA_ARGS__)
+
+#ifdef SK_DEBUG
+template <typename T, typename Traits>
+SkString GrTCluster_DebugStr(T* t) {
+    if (Traits::NumTargets(t) != 1) {
+        return SkStringPrintf("%d", Traits::GetID(t));
+    } else {
+        return SkStringPrintf("%d(%d)", Traits::GetID(t), Traits::GetTarget(t, 0));
+    }
+}
+
+template <typename T, typename Traits>
+SkString GrTCluster_DebugStr(SkSpan<const sk_sp<T>> collection) {
+    SkString s;
+    for (const sk_sp<T>& t_sp : collection) {
+        s.appendf("%s ", GrTCluster_DebugStr<T, Traits>(t_sp.get()).c_str());
+    }
+    return s;
+}
+
+template <typename T, typename Traits>
+SkString GrTCluster_DebugStr(const SkTInternalLList<T>& collection) {
+    SkString s;
+    for (T* t : collection) {
+        s.appendf("%s ", GrTCluster_DebugStr<T, Traits>(t).c_str());
+    }
+    return s;
+}
+
+template<typename T, typename Traits>
+void GrTCluster_Validate(SkSpan<const sk_sp<T>> input, const SkTInternalLList<T>& llist) {
+    // Check that we didn't break dependencies.
+    SkTHashSet<T*> seen;
+    for (T* t : llist) {
+        seen.add(t);
+        for (int i = 0; i < Traits::NumDependencies(t); i++) {
+            T* dep = Traits::Dependency(t, i);
+            SkASSERTF(seen.contains(dep),
+                      "%s came before dependency %s",
+                      GrTCluster_DebugStr<T, Traits>(t).c_str(),
+                      GrTCluster_DebugStr<T, Traits>(dep).c_str());
+        }
+    }
+    // Check that llist has the same entries as the input.
+    for (const auto& orig : input) {
+        seen.remove(orig.get());
+    }
+    SkASSERT(seen.empty());
+}
+
+#endif  // SK_DEBUG
+
+// Returns whether reordering occurred.
+template <typename T, typename Traits>
+bool GrTCluster_Visit(T* task, SkTInternalLList<T>* llist,
+                      SkTHashMap<uint32_t, T*>* lastTaskMap) {
+    SkScopeExit exit([&]{
+        llist->addToTail(task);
+        CLUSTER_DEBUGF("Cluster: Output order is now: %s\n",
+                       GrTCluster_DebugStr<T, Traits>(*llist).c_str());
+    });
+    CLUSTER_DEBUGF("Cluster: ***Step***\nLooking at %s\n",
+                   GrTCluster_DebugStr<T, Traits>(task).c_str());
+    if (Traits::NumTargets(task) != 1) {
+        CLUSTER_DEBUGF("Cluster: %d targets. Emitting barriers.\n", Traits::NumTargets(task));
+        // Tasks with 0 or multiple targets are treated as full barriers
+        // for all their targets.
+        for (int j = 0; j < Traits::NumTargets(task); j++) {
+            lastTaskMap->remove(Traits::GetTarget(task, j));
+        }
+        return false;
+    }
+
+    uint32_t target = Traits::GetTarget(task, 0);
+    T* clusterTail = (lastTaskMap->find(target) ? *lastTaskMap->find(target) : nullptr);
+    lastTaskMap->set(target, task);
+
+    if (!clusterTail) {
+        CLUSTER_DEBUGF("Cluster: Bail, no cluster to extend for target %d.\n", SkToInt(target));
+        return false;
+    }
+
+    CLUSTER_DEBUGF("Cluster: clusterTail is %s.\n",
+                   GrTCluster_DebugStr<T, Traits>(clusterTail).c_str());
+
+    if (clusterTail == llist->tail()) {
+        CLUSTER_DEBUGF("Cluster: Bail, cluster is already tail.\n");
+        return false;
+    }
+    T* movedHead = clusterTail->fNext;
+
+    // Now, let's refer to the "cluster" as the chain of tasks with the same target that we're
+    // hoping to extend by reordering. Let the "moved tasks" be the ones that currently sit after
+    // the cluster and must be displaced behind it in order to extend it.
+    T* clusterHead = clusterTail;
+    while (clusterHead->fPrev
+           && 1 == Traits::NumTargets(clusterHead->fPrev)
+           && target == Traits::GetTarget(clusterHead->fPrev, 0)) {
+        clusterHead = clusterHead->fPrev;
+    }
+
+    // We can't reorder if any moved task depends on anything in the cluster.
+    // Collect the cluster into a hash set.
+    // TODO: Is this overkill? Is a linear search appropriate for small clusters?
+    SkTHashSet<T*> clusterSet;
+    for (T* t = clusterHead; t != movedHead; t = t->fNext) {
+        clusterSet.add(t);
+    }
+    // Then check the constraint.
+    for (T* moved = movedHead; moved; moved = moved->fNext) {
+        for (int j = 0; j < Traits::NumDependencies(moved); j++) {
+            T* dep = Traits::Dependency(moved, j);
+            if (clusterSet.contains(dep)) {
+                CLUSTER_DEBUGF("Cluster: Bail, %s depends on %s.\n",
+                               GrTCluster_DebugStr<T, Traits>(moved).c_str(),
+                               GrTCluster_DebugStr<T, Traits>(dep).c_str());
+                return false;
+            }
+        }
+    }
+
+    // Grab the moved tasks and pull them before clusterHead.
+    for (T* moved = movedHead; moved;) {
+        CLUSTER_DEBUGF("Cluster: Reorder %s behind %s.\n",
+                       GrTCluster_DebugStr<T, Traits>(moved).c_str(),
+                       GrTCluster_DebugStr<T, Traits>(clusterHead).c_str());
+        // Be careful to save fNext before each move.
+        T* nextMoved = moved->fNext;
+        llist->remove(moved);
+        llist->addBefore(moved, clusterHead);
+        moved = nextMoved;
+    }
+    return true;
+}
+
+// Take a topologically-sorted DAG and cluster the entries together while maintaining the
+// dependency order.
+//
+// Returns false if no reordering occurred (note: llist may still have been populated in that
+// case and should be ignored). Otherwise, returns true and populates the provided llist as such:
+//   - Contains the same set of entries as `input`.
+//   - Obeys the dependency rules in `input`.
+//   - Places entries with the same Target (see Traits) adjacent to each other.
+//   - Entries with multiple targets act as reordering barriers for all their targets.
+//
+// T must declare a public SkTInternalLList interface.
+//
+// Traits requires:
+//
+//   static int NumTargets(const T* t) { ... }       // number of targets this entry writes to
+//   static uint32_t GetTarget(const T* t, int i)    // unique ID of the i-th target
+//   static int NumDependencies(const T* t) { ... }  // number of entries this entry depends on
+//   static T* Dependency(T* t, int index) { ... }   // the index-th dependency
+//   static uint32_t GetID(T* t) { ... }             // unique ID of the entry (debugging only)
+template <typename T, typename Traits = T>
+bool GrTCluster(SkSpan<const sk_sp<T>> input, SkTInternalLList<T>* llist) {
+    SkASSERT(llist->isEmpty());
+
+    if (input.count() < 3) {
+        return false;
+    }
+
+    CLUSTER_DEBUGF("Cluster: Original order is %s\n",
+                   GrTCluster_DebugStr<T, Traits>(input).c_str());
+
+    SkTHashMap<uint32_t, T*> lastTaskMap;
+    bool didReorder = false;
+    for (const auto& t : input) {
+        didReorder |= GrTCluster_Visit<T, Traits>(t.get(), llist, &lastTaskMap);
+    }
+
+#ifdef SK_DEBUG
+    if (didReorder) {
+        GrTCluster_Validate<T, Traits>(input, *llist);
+    }
+#endif
+
+    return didReorder;
+}
+
+#undef CLUSTER_DEBUGF
+
+#endif
diff --git a/tests/GrTClusterTest.cpp b/tests/GrTClusterTest.cpp
new file mode 100644
index 0000000..237085d
--- /dev/null
+++ b/tests/GrTClusterTest.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "src/gpu/GrTCluster.h"
+#include "tests/Test.h"
+
+#include "tools/ToolUtils.h"
+
+using ToolUtils::TopoTestNode;
+
+typedef void (*CreateGraphPF)(SkTArray<sk_sp<TopoTestNode>>* graph,
+                              SkTArray<sk_sp<TopoTestNode>>* expected);
+
+/*
+ * In:  A1 B1 A2
+ * Out: B1 A1 A2
+ */
+static void create_graph0(SkTArray<sk_sp<TopoTestNode>>* graph,
+                          SkTArray<sk_sp<TopoTestNode>>* expected) {
+    ToolUtils::TopoTestNode::AllocNodes(graph, 3);
+
+    graph->at(0)->targets(0);
+    graph->at(1)->targets(1);
+    graph->at(2)->targets(0);
+    graph->at(2)->dependsOn(graph->at(0).get());
+
+    expected->push_back(graph->at(1));
+    expected->push_back(graph->at(0));
+    expected->push_back(graph->at(2));
+}
+
+/*
+ * In:  A1 B1 A2 C1 A3
+ * Out: B1 C1 A1 A2 A3
+ */
+static void create_graph1(SkTArray<sk_sp<TopoTestNode>>* graph,
+                          SkTArray<sk_sp<TopoTestNode>>* expected) {
+    ToolUtils::TopoTestNode::AllocNodes(graph, 5);
+
+    graph->at(0)->targets(0);
+    graph->at(1)->targets(1);
+    graph->at(2)->targets(0);
+    graph->at(3)->targets(2);
+    graph->at(4)->targets(0);
+
+    expected->push_back(graph->at(1));
+    expected->push_back(graph->at(3));
+    expected->push_back(graph->at(0));
+    expected->push_back(graph->at(2));
+    expected->push_back(graph->at(4));
+}
+
+/*
+ * In:   A1 B1 A2.
+ * Deps: B1 depends on A1, A2 depends on B1.
+ * Out:  A1 B1 A2. Can't reorder.
+ */
+static void create_graph2(SkTArray<sk_sp<TopoTestNode>>* graph,
+                          SkTArray<sk_sp<TopoTestNode>>* expected) {
+    ToolUtils::TopoTestNode::AllocNodes(graph, 3);
+
+    graph->at(0)->targets(0);
+    graph->at(1)->targets(1);
+    graph->at(2)->targets(0);
+
+    graph->at(1)->dependsOn(graph->at(0).get());
+    graph->at(2)->dependsOn(graph->at(1).get());
+
+    // expected is empty. Can't reorder.
+}
+
+DEF_TEST(GrTCluster, reporter) {
+    CreateGraphPF tests[] = {
+        create_graph0,
+        create_graph1,
+        create_graph2
+    };
+
+    for (size_t i = 0; i < SK_ARRAY_COUNT(tests); ++i) {
+        SkTArray<sk_sp<TopoTestNode>> graph;
+        SkTArray<sk_sp<TopoTestNode>> expectedOutput;
+
+        (tests[i])(&graph, &expectedOutput);
+
+        SkTInternalLList<TopoTestNode> llist;
+        bool actualResult = GrTCluster<TopoTestNode>(graph, &llist);
+
+        if (expectedOutput.empty()) {
+            REPORTER_ASSERT(reporter, !actualResult);
+        } else {
+            REPORTER_ASSERT(reporter, actualResult);
+            // SkTInternalLList::countEntries is debug-only and these tests run in release.
+            int newCount = 0;
+            for ([[maybe_unused]] TopoTestNode* t : llist) {
+                newCount++;
+            }
+            REPORTER_ASSERT(reporter, newCount == expectedOutput.count());
+
+            int j = 0;
+            for (TopoTestNode* n : llist) {
+                REPORTER_ASSERT(reporter, n == expectedOutput[j++].get());
+            }
+        }
+
+        //SkDEBUGCODE(print(graph);)
+    }
+}
diff --git a/tools/ToolUtils.h b/tools/ToolUtils.h
index e141663..e655e8d 100644
--- a/tools/ToolUtils.h
+++ b/tools/ToolUtils.h
@@ -27,6 +27,7 @@
 #include "include/private/SkTArray.h"
 #include "include/private/SkTDArray.h"
 #include "include/utils/SkRandom.h"
+#include "src/core/SkTInternalLList.h"
 
 class SkBitmap;
 class SkCanvas;
@@ -156,6 +157,7 @@
     TopoTestNode(int id) : fID(id) {}
 
     void dependsOn(TopoTestNode* src) { *fDependencies.append() = src; }
+    void targets(uint32_t target) { *fTargets.append() = target; }
 
     int  id() const { return fID; }
     void reset() {
@@ -203,6 +205,9 @@
     static TopoTestNode* Dependency(TopoTestNode* node, int index) {
         return node->fDependencies[index];
     }
+    static int           NumTargets(TopoTestNode* node) { return node->fTargets.count(); }
+    static uint32_t      GetTarget(TopoTestNode* node, int i) { return node->fTargets[i]; }
+    static uint32_t      GetID(TopoTestNode* node) { return node->id(); }
 
     // Helper functions for TopoSortBench & TopoSortTest
     static void AllocNodes(SkTArray<sk_sp<ToolUtils::TopoTestNode>>* graph, int num) {
@@ -231,6 +236,8 @@
         }
     }
 
+    SK_DECLARE_INTERNAL_LLIST_INTERFACE(TopoTestNode);
+
 private:
     int      fID;
     uint32_t fOutputPos = 0;
@@ -238,6 +245,7 @@
     bool     fWasOutput = false;
 
     SkTDArray<TopoTestNode*> fDependencies;
+    SkTDArray<uint32_t>      fTargets;
 };
 
 template <typename T>