Revert of SkOnce: 2 bytes -> 1 byte (patchset #4 id:60001 of https://codereview.chromium.org/1904483003/ )

Reason for revert:
bust the roll

Original issue's description:
> SkOnce: 2 bytes -> 1 byte
>
> This uses the same logic we worked out for SkOncePtr to reduce
> the memory footprint of SkOnce from a done byte and lock byte
> to a single 3-state byte:
>
>   - NotStarted: no thread has tried to run fn() yet
>   - Active:     a thread is running fn()
>   - Done:       fn() is complete
>
> Threads which see Done return immediately.
> Threads which see NotStarted try to move to Active, run fn(), then move to Done.
> Threads which see Active spin until the active thread moves to Done.
>
> This additionally fixes a too-weak memory order bug in SkOncePtr,
> and adds a big note to explain.
>
> BUG=skia:
> GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1904483003
>
> Committed: https://skia.googlesource.com/skia/+/df02d338be8e3c1c50b48a3a9faa582703a39c07

TBR=herb@google.com
# Skipping CQ checks because original CL landed less than 1 days ago.
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=skia:

Review URL: https://codereview.chromium.org/1898413004
diff --git a/include/private/SkOnce.h b/include/private/SkOnce.h
index 20bf427..d83b63f 100644
--- a/include/private/SkOnce.h
+++ b/include/private/SkOnce.h
@@ -8,9 +8,9 @@
 #ifndef SkOnce_DEFINED
 #define SkOnce_DEFINED
 
+#include "../private/SkSpinlock.h"
 #include <atomic>
 #include <utility>
-#include "SkTypes.h"
 
 // SkOnce provides call-once guarantees for Skia, much like std::once_flag/std::call_once().
 //
@@ -21,50 +21,20 @@
 public:
     template <typename Fn, typename... Args>
     void operator()(Fn&& fn, Args&&... args) {
-        auto state = fState.load(std::memory_order_acquire);
-
-        if (state == Done) {
-            return;
-        }
-
-        if (state == NotStarted) {
-            // Try to claim the job of calling fn() by swapping from NotStarted to Calling.
-            // See [1] below for why we use std::memory_order_acquire instead of relaxed.
-            if (fState.compare_exchange_strong(state, Calling, std::memory_order_acquire)) {
-                // Claimed!  Call fn(), then mark this SkOnce as Done.
+        // Vanilla double-checked locking.
+        if (!fDone.load(std::memory_order_acquire)) {
+            fLock.acquire();
+            if (!fDone.load(std::memory_order_relaxed)) {
                 fn(std::forward<Args>(args)...);
-                return fState.store(Done, std::memory_order_release);
+                fDone.store(true, std::memory_order_release);
             }
+            fLock.release();
         }
-
-        while (state == Calling) {
-            // Some other thread is calling fn().  Wait for them to finish.
-            state = fState.load(std::memory_order_acquire);
-        }
-        SkASSERT(state == Done);
     }
 
 private:
-    enum State : uint8_t { NotStarted, Calling, Done};
-    std::atomic<State> fState{NotStarted};
+    std::atomic<bool> fDone{false};
+    SkSpinlock        fLock;
 };
 
-/* [1]  Why do we compare_exchange_strong() with std::memory_order_acquire instead of relaxed?
- *
- * If we succeed, we really only need a relaxed compare_exchange_strong()... we're the ones
- * who are about to do a release store, so there's certainly nothing yet for an acquire to
- * synchronize with.
- *
- * If that compare_exchange_strong() fails, we're either in Calling or Done state.
- * Again, if we're in Calling state, relaxed would have been fine: the spin loop will
- * acquire up to the Calling thread's release store.
- *
- * But if that compare_exchange_strong() fails and we find ourselves in the Done state,
- * we've never done an acquire load to sync up to the store of that Done state.
- *
- * So on failure we need an acquire load.  Generally the failure memory order cannot be
- * stronger than the success memory order, so we need acquire on success too.  The single
- * memory order version of compare_exchange_strong() uses the same acquire order for both.
- */
-
 #endif  // SkOnce_DEFINED
diff --git a/include/private/SkOncePtr.h b/include/private/SkOncePtr.h
index b60d968..3c1ab63 100644
--- a/include/private/SkOncePtr.h
+++ b/include/private/SkOncePtr.h
@@ -66,9 +66,8 @@
             if (state == 0) {
                 // It looks like no one has tried to create our pointer yet.
                 // We try to claim that task by atomically swapping our state from '0' to '1'.
-                // See SkOnce.h for why we use an acquire memory order here rather than relaxed.
                 if (sk_atomic_compare_exchange(
-                    &fState, &state, (uintptr_t)1, sk_memory_order_acquire, sk_memory_order_acquire)) {
+                    &fState, &state, (uintptr_t)1, sk_memory_order_relaxed, sk_memory_order_relaxed)) {
                     // We've claimed it.  Create our pointer and store it into fState.
                     state = (uintptr_t)f();
                     SkASSERT(state > 1);