Refactor PackageWatchdog explicit health checks

Improvements:

1. Queuing PackageWatchdog requests to startObserving packages:
When observing packages with the watchdog, we needed to get
the packages supporting explicit health checks so we can decide if a
package should be passing or not. This prevents us from receiving
requests to monitor packages during early boot, before third party
packages are ready. In this change we don't depend on ExtServices to
be up to startObserving, we initially treat all package as failing a
health check and lazily syncRequests to request or cancel explicit
health checks based on the currently observed packages. When we receive
onSupportedPackages, we mark the packages that don't support health
checks as passing.

2. Lazy binding to the explicit health check service:
We were always bound to the explicit health check
service regardless of whether we are expecting requests or not, we need
to be able to bind and unbind dynamically to improve device resource
usage. In this change, we bind as soon as we make a request and are
expecting results, we unbind otherwise.

3. Fixed Races:
There were a couple of potential races that could lead to exceptions
that could bring the system server down, e.g when the service is
transitioning between disconnected and connected state (maybe it
crashed) or when ExtServices is being upated and is down or early
boot requests when third party apps are not ready. This change fixes such.

4. Logging:
We improved the logging wording and order and made it more consistent

Bug: 120598832
Test: Manual tests. Stress tested behavior by killing extservices and
making requests simultaneously

function killproc {
    while true
    do
        local pid=$(adb shell pidof $1)
        if [[ ! -z $pid ]]
        then
            echo $pid
            adb shell kill $pid
        fi
    done;
}

adb install-multi-package -i com.android.shell --enable-rollback \
NetworkStack.apk ModuleMetadataGoogle.apk

Also switched between enabled and disabled states to verify packages
are handled correctly. Will automate these tests in later cl

atest PackageWatchdogTest
Change-Id: Iafaef553e95d107f700109f9a8328950a5e2bf71
diff --git a/services/core/java/com/android/server/ExplicitHealthCheckController.java b/services/core/java/com/android/server/ExplicitHealthCheckController.java
index 164837a..27ad208 100644
--- a/services/core/java/com/android/server/ExplicitHealthCheckController.java
+++ b/services/core/java/com/android/server/ExplicitHealthCheckController.java
@@ -41,10 +41,14 @@
 import com.android.internal.annotations.GuardedBy;
 import com.android.internal.util.Preconditions;
 
+import java.util.Collection;
 import java.util.Collections;
+import java.util.Iterator;
 import java.util.List;
+import java.util.Set;
 import java.util.function.Consumer;
 
+// TODO(b/120598832): Add tests
 /**
  * Controls the connections with {@link ExplicitHealthCheckService}.
  */
@@ -53,18 +57,30 @@
     private final Object mLock = new Object();
     private final Context mContext;
 
-    // Called everytime the service is connected, so the watchdog can sync it's state with
-    // the health check service. In practice, should never be null after it has been #setEnabled.
-    @GuardedBy("mLock") @Nullable private Runnable mOnConnected;
     // Called everytime a package passes the health check, so the watchdog is notified of the
     // passing check. In practice, should never be null after it has been #setEnabled.
+    // To prevent deadlocks between the controller and watchdog threads, we have
+    // a lock invariant to ALWAYS acquire the PackageWatchdog#mLock before #mLock in this class.
+    // It's easier to just NOT hold #mLock when calling into watchdog code on this consumer.
     @GuardedBy("mLock") @Nullable private Consumer<String> mPassedConsumer;
+    // Called everytime after a successful #syncRequest call, so the watchdog can receive packages
+    // supporting health checks and update its internal state. In practice, should never be null
+    // after it has been #setEnabled.
+    // To prevent deadlocks between the controller and watchdog threads, we have
+    // a lock invariant to ALWAYS acquire the PackageWatchdog#mLock before #mLock in this class.
+    // It's easier to just NOT hold #mLock when calling into watchdog code on this consumer.
+    @GuardedBy("mLock") @Nullable private Consumer<List<String>> mSupportedConsumer;
+    // Called everytime we need to notify the watchdog to sync requests between itself and the
+    // health check service. In practice, should never be null after it has been #setEnabled.
+    // To prevent deadlocks between the controller and watchdog threads, we have
+    // a lock invariant to ALWAYS acquire the PackageWatchdog#mLock before #mLock in this class.
+    // It's easier to just NOT hold #mLock when calling into watchdog code on this runnable.
+    @GuardedBy("mLock") @Nullable private Runnable mNotifySyncRunnable;
     // Actual binder object to the explicit health check service.
     @GuardedBy("mLock") @Nullable private IExplicitHealthCheckService mRemoteService;
-    // Cache for packages supporting explicit health checks. This cache should not change while
-    // the health check service is running.
-    @GuardedBy("mLock") @Nullable private List<String> mSupportedPackages;
-    // Connection to the explicit health check service, necessary to unbind
+    // Connection to the explicit health check service, necessary to unbind.
+    // We should only try to bind if mConnection is null, non-null indicates we
+    // are connected or at least connecting.
     @GuardedBy("mLock") @Nullable private ServiceConnection mConnection;
     // Bind state of the explicit health check service.
     @GuardedBy("mLock") private boolean mEnabled;
@@ -73,23 +89,117 @@
         mContext = context;
     }
 
+    /** Enables or disables explicit health checks. */
+    public void setEnabled(boolean enabled) {
+        synchronized (mLock) {
+            Slog.i(TAG, "Explicit health checks " + (enabled ? "enabled." : "disabled."));
+            mEnabled = enabled;
+        }
+    }
+
+    /**
+     * Sets callbacks to listen to important events from the controller.
+     *
+     * <p> Should be called once at initialization before any other calls to the controller to
+     * ensure a happens-before relationship of the set parameters and visibility on other threads.
+     */
+    public void setCallbacks(Consumer<String> passedConsumer,
+            Consumer<List<String>> supportedConsumer, Runnable notifySyncRunnable) {
+        synchronized (mLock) {
+            if (mPassedConsumer != null || mSupportedConsumer != null
+                    || mNotifySyncRunnable != null) {
+                Slog.wtf(TAG, "Resetting health check controller callbacks");
+            }
+
+            mPassedConsumer = Preconditions.checkNotNull(passedConsumer);
+            mSupportedConsumer = Preconditions.checkNotNull(supportedConsumer);
+            mNotifySyncRunnable = Preconditions.checkNotNull(notifySyncRunnable);
+        }
+    }
+
+    /**
+     * Calls the health check service to request or cancel packages based on
+     * {@code newRequestedPackages}.
+     *
+     * <p> Supported packages in {@code newRequestedPackages} that have not been previously
+     * requested will be requested while supported packages not in {@code newRequestedPackages}
+     * but were previously requested will be cancelled.
+     *
+     * <p> This handles binding and unbinding to the health check service as required.
+     *
+     * <p> Note, calling this may modify {@code newRequestedPackages}.
+     *
+     * <p> Note, this method is not thread safe, all calls should be serialized.
+     */
+    public void syncRequests(Set<String> newRequestedPackages) {
+        boolean enabled;
+        synchronized (mLock) {
+            enabled = mEnabled;
+        }
+
+        if (!enabled) {
+            Slog.i(TAG, "Health checks disabled, no supported packages");
+            // Call outside lock
+            mSupportedConsumer.accept(Collections.emptyList());
+            return;
+        }
+
+        getSupportedPackages(supportedPackages -> {
+            // Notify the watchdog without lock held
+            mSupportedConsumer.accept(supportedPackages);
+            getRequestedPackages(previousRequestedPackages -> {
+                synchronized (mLock) {
+                    // Hold lock so requests and cancellations are sent atomically.
+                    // It is important we don't mix requests from multiple threads.
+
+                    // Note, this may modify newRequestedPackages
+                    newRequestedPackages.retainAll(supportedPackages);
+
+                    // Cancel packages no longer requested
+                    actOnDifference(previousRequestedPackages,
+                            newRequestedPackages, p -> cancel(p));
+                    // Request packages not yet requested
+                    actOnDifference(newRequestedPackages,
+                            previousRequestedPackages, p -> request(p));
+
+                    if (newRequestedPackages.isEmpty()) {
+                        Slog.i(TAG, "No more health check requests, unbinding...");
+                        unbindService();
+                        return;
+                    }
+                }
+            });
+        });
+    }
+
+    private void actOnDifference(Collection<String> collection1, Collection<String> collection2,
+            Consumer<String> action) {
+        Iterator<String> iterator = collection1.iterator();
+        while (iterator.hasNext()) {
+            String packageName = iterator.next();
+            if (!collection2.contains(packageName)) {
+                action.accept(packageName);
+            }
+        }
+    }
+
     /**
      * Requests an explicit health check for {@code packageName}.
      * After this request, the callback registered on {@link #setCallbacks} can receive explicit
      * health check passed results.
-     *
-     * @throws IllegalStateException if the service is not started
      */
-    public void request(String packageName) throws RemoteException {
+    private void request(String packageName) {
         synchronized (mLock) {
-            if (!mEnabled) {
+            if (!prepareServiceLocked("request health check for " + packageName)) {
                 return;
             }
 
-            enforceServiceReadyLocked();
-
             Slog.i(TAG, "Requesting health check for package " + packageName);
-            mRemoteService.request(packageName);
+            try {
+                mRemoteService.request(packageName);
+            } catch (RemoteException e) {
+                Slog.w(TAG, "Failed to request health check for package " + packageName, e);
+            }
         }
     }
 
@@ -97,46 +207,46 @@
      * Cancels all explicit health checks for {@code packageName}.
      * After this request, the callback registered on {@link #setCallbacks} can no longer receive
      * explicit health check passed results.
-     *
-     * @throws IllegalStateException if the service is not started
      */
-    public void cancel(String packageName) throws RemoteException {
+    private void cancel(String packageName) {
         synchronized (mLock) {
-            if (!mEnabled) {
+            if (!prepareServiceLocked("cancel health check for " + packageName)) {
                 return;
             }
 
-            enforceServiceReadyLocked();
-
             Slog.i(TAG, "Cancelling health check for package " + packageName);
-            mRemoteService.cancel(packageName);
+            try {
+                mRemoteService.cancel(packageName);
+            } catch (RemoteException e) {
+                // Do nothing, if the service is down, when it comes up, we will sync requests,
+                // if there's some other error, retrying wouldn't fix anyways.
+                Slog.w(TAG, "Failed to cancel health check for package " + packageName, e);
+            }
         }
     }
 
     /**
      * Returns the packages that we can request explicit health checks for.
      * The packages will be returned to the {@code consumer}.
-     *
-     * @throws IllegalStateException if the service is not started
      */
-    public void getSupportedPackages(Consumer<List<String>> consumer) throws RemoteException {
+    private void getSupportedPackages(Consumer<List<String>> consumer) {
         synchronized (mLock) {
-            if (!mEnabled) {
-                consumer.accept(Collections.emptyList());
+            if (!prepareServiceLocked("get health check supported packages")) {
                 return;
             }
 
-            enforceServiceReadyLocked();
-
-            if (mSupportedPackages == null) {
-                Slog.d(TAG, "Getting health check supported packages");
+            Slog.d(TAG, "Getting health check supported packages");
+            try {
                 mRemoteService.getSupportedPackages(new RemoteCallback(result -> {
-                    mSupportedPackages = result.getStringArrayList(EXTRA_SUPPORTED_PACKAGES);
-                    consumer.accept(mSupportedPackages);
+                    List<String> packages = result.getStringArrayList(EXTRA_SUPPORTED_PACKAGES);
+                    Slog.i(TAG, "Explicit health check supported packages " + packages);
+                    consumer.accept(packages);
                 }));
-            } else {
-                Slog.d(TAG, "Getting cached health check supported packages");
-                consumer.accept(mSupportedPackages);
+            } catch (RemoteException e) {
+                // Request failed, treat as if all observed packages are supported, if any packages
+                // expire during this period, we may incorrectly treat it as failing health checks
+                // even if we don't support health checks for the package.
+                Slog.w(TAG, "Failed to get health check supported packages", e);
             }
         }
     }
@@ -144,57 +254,42 @@
     /**
      * Returns the packages for which health checks are currently in progress.
      * The packages will be returned to the {@code consumer}.
-     *
-     * @throws IllegalStateException if the service is not started
      */
-    public void getRequestedPackages(Consumer<List<String>> consumer) throws RemoteException {
+    private void getRequestedPackages(Consumer<List<String>> consumer) {
         synchronized (mLock) {
-            if (!mEnabled) {
-                consumer.accept(Collections.emptyList());
+            if (!prepareServiceLocked("get health check requested packages")) {
                 return;
             }
 
-            enforceServiceReadyLocked();
-
             Slog.d(TAG, "Getting health check requested packages");
-            mRemoteService.getRequestedPackages(new RemoteCallback(
-                    result -> consumer.accept(
-                            result.getStringArrayList(EXTRA_REQUESTED_PACKAGES))));
-        }
-    }
-
-    /** Enables or disables explicit health checks. */
-    public void setEnabled(boolean enabled) {
-        synchronized (mLock) {
-            if (enabled == mEnabled) {
-                return;
-            }
-
-            Slog.i(TAG, "Setting explicit health checks enabled " + enabled);
-            mEnabled = enabled;
-            if (enabled) {
-                bindService();
-            } else {
-                unbindService();
+            try {
+                mRemoteService.getRequestedPackages(new RemoteCallback(result -> {
+                    List<String> packages = result.getStringArrayList(EXTRA_REQUESTED_PACKAGES);
+                    Slog.i(TAG, "Explicit health check requested packages " + packages);
+                    consumer.accept(packages);
+                }));
+            } catch (RemoteException e) {
+                // Request failed, treat as if we haven't requested any packages, if any packages
+                // were actually requested, they will not be cancelled now. May be cancelled later
+                Slog.w(TAG, "Failed to get health check requested packages", e);
             }
         }
     }
 
     /**
-     * Sets callbacks to listen to important events from the controller.
-     * Should be called at initialization.
+     * Binds to the explicit health check service if the controller is enabled and
+     * not already bound.
      */
-    public void setCallbacks(Runnable onConnected, Consumer<String> passedConsumer) {
-        Preconditions.checkNotNull(onConnected);
-        Preconditions.checkNotNull(passedConsumer);
-        mOnConnected = onConnected;
-        mPassedConsumer = passedConsumer;
-    }
-
-    /** Binds to the explicit health check service. */
     private void bindService() {
         synchronized (mLock) {
-            if (mRemoteService != null) {
+            if (!mEnabled || mConnection != null || mRemoteService != null) {
+                if (!mEnabled) {
+                    Slog.i(TAG, "Not binding to service, service disabled");
+                } else if (mRemoteService != null) {
+                    Slog.i(TAG, "Not binding to service, service already connected");
+                } else {
+                    Slog.i(TAG, "Not binding to service, service already connecting");
+                }
                 return;
             }
             ComponentName component = getServiceComponentNameLocked();
@@ -205,14 +300,11 @@
 
             Intent intent = new Intent();
             intent.setComponent(component);
-            // TODO: Fix potential race conditions during mConnection state transitions.
-            // E.g after #onServiceDisconected, the mRemoteService object is invalid until
-            // we get an #onServiceConnected.
             mConnection = new ServiceConnection() {
                 @Override
                 public void onServiceConnected(ComponentName name, IBinder service) {
-                    initState(service);
                     Slog.i(TAG, "Explicit health check service is connected " + name);
+                    initState(service);
                 }
 
                 @Override
@@ -221,19 +313,18 @@
                     // Service crashed or process was killed, #onServiceConnected will be called.
                     // Don't need to re-bind.
                     Slog.i(TAG, "Explicit health check service is disconnected " + name);
+                    synchronized (mLock) {
+                        mRemoteService = null;
+                    }
                 }
 
                 @Override
                 public void onBindingDied(ComponentName name) {
                     // Application hosting service probably got updated
                     // Need to re-bind.
-                    synchronized (mLock) {
-                        if (mEnabled) {
-                            unbindService();
-                            bindService();
-                        }
-                    }
-                    Slog.i(TAG, "Explicit health check service binding is dead " + name);
+                    Slog.i(TAG, "Explicit health check service binding is dead. Rebind: " + name);
+                    unbindService();
+                    bindService();
                 }
 
                 @Override
@@ -243,9 +334,9 @@
                 }
             };
 
-            Slog.i(TAG, "Binding to explicit health service");
-            mContext.bindServiceAsUser(intent, mConnection, Context.BIND_AUTO_CREATE,
-                    UserHandle.of(UserHandle.USER_SYSTEM));
+            mContext.bindServiceAsUser(intent, mConnection,
+                    Context.BIND_AUTO_CREATE, UserHandle.of(UserHandle.USER_SYSTEM));
+            Slog.i(TAG, "Explicit health check service is bound");
         }
     }
 
@@ -253,10 +344,11 @@
     private void unbindService() {
         synchronized (mLock) {
             if (mRemoteService != null) {
-                Slog.i(TAG, "Unbinding from explicit health service");
                 mContext.unbindService(mConnection);
                 mRemoteService = null;
+                mConnection = null;
             }
+            Slog.i(TAG, "Explicit health check service is unbound");
         }
     }
 
@@ -301,40 +393,54 @@
 
     private void initState(IBinder service) {
         synchronized (mLock) {
-            mSupportedPackages = null;
+            if (!mEnabled) {
+                Slog.w(TAG, "Attempting to connect disabled service?? Unbinding...");
+                // Very unlikely, but we disabled the service after binding but before we connected
+                unbindService();
+                return;
+            }
             mRemoteService = IExplicitHealthCheckService.Stub.asInterface(service);
             try {
                 mRemoteService.setCallback(new RemoteCallback(result -> {
                     String packageName = result.getString(EXTRA_HEALTH_CHECK_PASSED_PACKAGE);
                     if (!TextUtils.isEmpty(packageName)) {
-                        synchronized (mLock) {
-                            if (mPassedConsumer == null) {
-                                Slog.w(TAG, "Health check passed for package " + packageName
-                                        + "but no consumer registered.");
-                            } else {
-                                mPassedConsumer.accept(packageName);
-                            }
+                        if (mPassedConsumer == null) {
+                            Slog.wtf(TAG, "Health check passed for package " + packageName
+                                    + "but no consumer registered.");
+                        } else {
+                            // Call without lock held
+                            mPassedConsumer.accept(packageName);
                         }
                     } else {
-                        Slog.w(TAG, "Empty package passed explicit health check?");
+                        Slog.wtf(TAG, "Empty package passed explicit health check?");
                     }
                 }));
-                if (mOnConnected == null) {
-                    Slog.w(TAG, "Health check service connected but no runnable registered.");
-                } else {
-                    mOnConnected.run();
-                }
+                Slog.i(TAG, "Service initialized, syncing requests");
             } catch (RemoteException e) {
                 Slog.wtf(TAG, "Could not setCallback on explicit health check service");
             }
         }
+        // Calling outside lock
+        mNotifySyncRunnable.run();
     }
 
+    /**
+     * Prepares the health check service to receive requests.
+     *
+     * @return {@code true} if it is ready and we can proceed with a request,
+     * {@code false} otherwise. If it is not ready, and the service is enabled,
+     * we will bind and the request should be automatically attempted later.
+     */
     @GuardedBy("mLock")
-    private void enforceServiceReadyLocked() {
-        if (mRemoteService == null) {
-            // TODO: Try to bind to service
-            throw new IllegalStateException("Explicit health check service not ready");
+    private boolean prepareServiceLocked(String action) {
+        if (mRemoteService != null && mEnabled) {
+            return true;
         }
+        Slog.i(TAG, "Service not ready to " + action
+                + (mEnabled ? ". Binding..." : ". Disabled"));
+        if (mEnabled) {
+            bindService();
+        }
+        return false;
     }
 }