SkOnce: 2 bytes -> 1 byte

This uses the same logic we worked out for SkOncePtr to reduce
the memory footprint of SkOnce from a done byte and lock byte
to a single 3-state byte:

  - NotStarted: no thread has tried to run fn() yet
  - Active:     a thread is running fn()
  - Done:       fn() is complete

Threads which see Done return immediately.
Threads which see NotStarted try to move to Active, run fn(), then move to Done.
Threads which see Active spin until the active thread moves to Done.

This additionally fixes a too-weak memory order bug in SkOncePtr,
and adds a big note to explain.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1904483003

Review URL: https://codereview.chromium.org/1904483003
diff --git a/include/private/SkOnce.h b/include/private/SkOnce.h
index d83b63f..20bf427 100644
--- a/include/private/SkOnce.h
+++ b/include/private/SkOnce.h
@@ -8,9 +8,9 @@
 #ifndef SkOnce_DEFINED
 #define SkOnce_DEFINED
 
-#include "../private/SkSpinlock.h"
 #include <atomic>
 #include <utility>
+#include "SkTypes.h"
 
 // SkOnce provides call-once guarantees for Skia, much like std::once_flag/std::call_once().
 //
@@ -21,20 +21,50 @@
 public:
     template <typename Fn, typename... Args>
     void operator()(Fn&& fn, Args&&... args) {
-        // Vanilla double-checked locking.
-        if (!fDone.load(std::memory_order_acquire)) {
-            fLock.acquire();
-            if (!fDone.load(std::memory_order_relaxed)) {
-                fn(std::forward<Args>(args)...);
-                fDone.store(true, std::memory_order_release);
-            }
-            fLock.release();
+        auto state = fState.load(std::memory_order_acquire);
+
+        if (state == Done) {
+            return;
         }
+
+        if (state == NotStarted) {
+            // Try to claim the job of calling fn() by swapping from NotStarted to Calling.
+            // See [1] below for why we use std::memory_order_acquire instead of relaxed.
+            if (fState.compare_exchange_strong(state, Calling, std::memory_order_acquire)) {
+                // Claimed!  Call fn(), then mark this SkOnce as Done.
+                fn(std::forward<Args>(args)...);
+                return fState.store(Done, std::memory_order_release);
+            }
+        }
+
+        while (state == Calling) {
+            // Some other thread is calling fn().  Wait for them to finish.
+            state = fState.load(std::memory_order_acquire);
+        }
+        SkASSERT(state == Done);
     }
 
 private:
-    std::atomic<bool> fDone{false};
-    SkSpinlock        fLock;
+    enum State : uint8_t { NotStarted, Calling, Done};
+    std::atomic<State> fState{NotStarted};
 };
 
+/* [1]  Why do we compare_exchange_strong() with std::memory_order_acquire instead of relaxed?
+ *
+ * If we succeed, we really only need a relaxed compare_exchange_strong()... we're the ones
+ * who are about to do a release store, so there's certainly nothing yet for an acquire to
+ * synchronize with.
+ *
+ * If that compare_exchange_strong() fails, we're either in Calling or Done state.
+ * Again, if we're in Calling state, relaxed would have been fine: the spin loop will
+ * acquire up to the Calling thread's release store.
+ *
+ * But if that compare_exchange_strong() fails and we find ourselves in the Done state,
+ * we've never done an acquire load to sync up to the store of that Done state.
+ *
+ * So on failure we need an acquire load.  Generally the failure memory order cannot be
+ * stronger than the success memory order, so we need acquire on success too.  The single
+ * memory order version of compare_exchange_strong() uses the same acquire order for both.
+ */
+
 #endif  // SkOnce_DEFINED
diff --git a/include/private/SkOncePtr.h b/include/private/SkOncePtr.h
index 3c1ab63..b60d968 100644
--- a/include/private/SkOncePtr.h
+++ b/include/private/SkOncePtr.h
@@ -66,8 +66,9 @@
             if (state == 0) {
                 // It looks like no one has tried to create our pointer yet.
                 // We try to claim that task by atomically swapping our state from '0' to '1'.
+                // See SkOnce.h for why we use an acquire memory order here rather than relaxed.
                 if (sk_atomic_compare_exchange(
-                    &fState, &state, (uintptr_t)1, sk_memory_order_relaxed, sk_memory_order_relaxed)) {
+                    &fState, &state, (uintptr_t)1, sk_memory_order_acquire, sk_memory_order_acquire)) {
                     // We've claimed it.  Create our pointer and store it into fState.
                     state = (uintptr_t)f();
                     SkASSERT(state > 1);