NNAPI Concurrent Query Management -- Implementation

The NNAPI requires requests on a model to be processed
asynchronously. This CL implements a basic Event that the runtime
thread can later block on until the asynchronous request has
completed.

The design document for NN API asynchronous behavior:
https://docs.google.com/a/google.com/document/d/1mO35KK3Mnr489ZftTDnKXXnXiYxk19jZ1C4DWOqaVB4/edit?usp=sharing

Bug: 63905942
Test: VtsHalNeuralnetworksV1_0TargetTest (32-bit, 64-bit) with sample driver enabled by cherry-pick
      frameworks/ml/nn/runtime/test with and without sample driver enabled
Change-Id: I97b1d4cbf189176fb3b21b2cc1af09dddaff18ab
diff --git a/nn/common/include/HalInterfaces.h b/nn/common/include/HalInterfaces.h
index 0da0efc..c6259ba 100644
--- a/nn/common/include/HalInterfaces.h
+++ b/nn/common/include/HalInterfaces.h
@@ -18,6 +18,7 @@
 #define ANDROID_ML_NN_COMMON_HAL_INTERFACES_H
 
 #include <android/hardware/neuralnetworks/1.0/IDevice.h>
+#include <android/hardware/neuralnetworks/1.0/IEvent.h>
 #include <android/hardware/neuralnetworks/1.0/IPreparedModel.h>
 #include <android/hardware/neuralnetworks/1.0/types.h>
 #include <android/hidl/allocator/1.0/IAllocator.h>
@@ -33,6 +34,7 @@
 using ::android::hardware::neuralnetworks::V1_0::DataLocation;
 using ::android::hardware::neuralnetworks::V1_0::DeviceStatus;
 using ::android::hardware::neuralnetworks::V1_0::IDevice;
+using ::android::hardware::neuralnetworks::V1_0::IEvent;
 using ::android::hardware::neuralnetworks::V1_0::IPreparedModel;
 using ::android::hardware::neuralnetworks::V1_0::InputOutputInfo;
 using ::android::hardware::neuralnetworks::V1_0::LocationValues;
@@ -44,6 +46,7 @@
 using ::android::hardware::neuralnetworks::V1_0::OperationType;
 using ::android::hardware::neuralnetworks::V1_0::PerformanceInfo;
 using ::android::hardware::neuralnetworks::V1_0::Request;
+using ::android::hardware::neuralnetworks::V1_0::Status;
 using ::android::hidl::allocator::V1_0::IAllocator;
 using ::android::hidl::memory::V1_0::IMemory;
 
diff --git a/nn/runtime/Android.bp b/nn/runtime/Android.bp
index 5321cd2..4fdc98d 100644
--- a/nn/runtime/Android.bp
+++ b/nn/runtime/Android.bp
@@ -33,6 +33,7 @@
     host_supported: false,
 
     srcs: [
+        "Event.cpp",
         "Manager.cpp",
         "Memory.cpp",
         "ModelBuilder.cpp",
diff --git a/nn/runtime/Event.cpp b/nn/runtime/Event.cpp
new file mode 100644
index 0000000..0fab86b
--- /dev/null
+++ b/nn/runtime/Event.cpp
@@ -0,0 +1,76 @@
+#include "Event.h"
+#include <android-base/logging.h>
+
+namespace android {
+namespace hardware {
+namespace neuralnetworks {
+namespace V1_0 {
+namespace implementation {
+
+Event::Event() : mStatus(Status::WAITING) {}  // a new event is WAITING until notify() is called
+
+Event::~Event() {
+    // A thread cannot join itself: detach if the last reference died on the bound thread.
+    if (mThread.joinable() && mThread.get_id() == std::this_thread::get_id()) mThread.detach();
+    if (mThread.joinable()) mThread.join();
+}
+
+Return<void> Event::notify(ReturnedStatus status) {
+    // Translate the HIDL status into the event's own status domain.
+    const bool succeeded = (status == ReturnedStatus::SUCCESS);
+    {
+        std::lock_guard<std::mutex> guard(mMutex);
+        mStatus = succeeded ? Status::SUCCESS : Status::ERROR;
+        // The callback runs under the lock, i.e. before any waiter can observe the status.
+        if (succeeded && mCallback != nullptr && !mCallback()) {
+            LOG(ERROR) << "Event::notify -- callback failed";
+        }
+    }
+    mCondition.notify_all();
+    return Void();
+}
+
+Event::Status Event::poll() {
+    std::lock_guard<std::mutex> guard(mMutex);  // mStatus is written by notify() under mMutex
+    return mStatus;
+}
+
+Event::Status Event::wait() {
+    std::unique_lock<std::mutex> guard(mMutex);
+    while (mStatus == Status::WAITING) mCondition.wait(guard);  // loop absorbs spurious wakeups
+    return mStatus;
+}
+
+bool Event::on_finish(std::function<bool(void)> callback) {
+    std::lock_guard<std::mutex> guard(mMutex);
+    // Reject a second binding first, then an empty callback; mCallback is unchanged on failure.
+    const char* problem = nullptr;
+    if (mCallback != nullptr) {
+        problem = "Event::on_finish -- a callback has already been bound to this event";
+    } else if (callback == nullptr) {
+        problem = "Event::on_finish -- the new callback is invalid";
+    }
+    if (problem != nullptr) { LOG(ERROR) << problem; return false; }
+    mCallback = std::move(callback);
+    return true;
+}
+
+bool Event::bind_thread(std::thread&& asyncThread) {
+    std::lock_guard<std::mutex> guard(mMutex);
+    // On either failure asyncThread is not moved from, so the caller still owns it.
+    const char* problem = nullptr;
+    if (mThread.joinable()) {
+        problem = "Event::bind_thread -- a thread has already been bound to this event";
+    } else if (!asyncThread.joinable()) {
+        problem = "Event::bind_thread -- the new thread is not joinable";
+    }
+    if (problem != nullptr) { LOG(ERROR) << problem; return false; }
+    mThread = std::move(asyncThread);
+    return true;
+}
+
+}  // namespace implementation
+}  // namespace V1_0
+}  // namespace neuralnetworks
+}  // namespace hardware
+}  // namespace android
diff --git a/nn/runtime/Event.h b/nn/runtime/Event.h
new file mode 100644
index 0000000..2e19585
--- /dev/null
+++ b/nn/runtime/Event.h
@@ -0,0 +1,192 @@
+#ifndef ANDROID_HARDWARE_NEURALNETWORKS_V1_0_EVENT_H
+#define ANDROID_HARDWARE_NEURALNETWORKS_V1_0_EVENT_H
+
+#include <android/hardware/neuralnetworks/1.0/IEvent.h>
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <hidl/MQDescriptor.h>
+#include <hidl/Status.h>
+#include <mutex>
+#include <thread>
+
+namespace android {
+namespace hardware {
+namespace neuralnetworks {
+namespace V1_0 {
+namespace implementation {
+
+using ::android::hardware::hidl_array;
+using ::android::hardware::hidl_memory;
+using ::android::hardware::hidl_string;
+using ::android::hardware::hidl_vec;
+using ::android::hardware::Return;
+using ::android::hardware::Void;
+using ::android::sp;
+
+using ReturnedStatus = ::android::hardware::neuralnetworks::V1_0::Status;
+
+/**
+ * The Event class is used internally by the NeuralNetworks runtime to
+ * synchronize between different threads. An asynchronous task is launched
+ * paired with an event object. When a client thread requires the output being
+ * processed by the asynchronous task, the client thread can wait for the result
+ * and be blocked until it has completed or a timeout condition has been
+ * reached, or poll the result periodically. Both poll and wait* may safely be
+ * called concurrently, even on the same event. When the server thread has
+ * completed, it should immediately call "notify" to indicate the corresponding
+ * output has been produced and awaken any client threads waiting on the event.
+ *
+ * This class exists to enable synchronization across HIDL. When synchronization
+ * is only required in the same process, consider using std::future, std::mutex,
+ * std::condition_variable, or std::experimental::latch instead.
+ */
+struct Event : public IEvent {
+    Event();
+    ~Event() override;
+
+    /**
+     * Event::Status::WAITING -- The corresponding asynchronous execution has
+     *                           not yet finished.
+     * Event::Status::SUCCESS -- The corresponding asynchronous execution has
+     *                           succeeded and the output is ready to be
+     *                           consumed.
+     * Event::Status::TIMEOUT -- The calling thread has waited longer than the
+     *                           user has specified. This only applies to the
+     *                           methods Event::wait_for and Event::wait_until.
+     * Event::Status::ERROR   -- The corresponding asynchronous execution has
+     *                           failed to properly execute.
+     */
+    enum class Status : uint32_t {
+        WAITING,
+        SUCCESS,
+        TIMEOUT,
+        ERROR,
+    };
+
+    /**
+     * IEvent::notify marks the event with the return status of the
+     * asynchronous call the event is paired with and enables all
+     * prior and future wait calls on the Event object to proceed. The
+     * call to IEvent::notify happens before any wait* calls on
+     * this event return (except in the case of TIMEOUT) and before
+     * any poll calls that see the resulting status. The asynchronous
+     * call the event is paired with must ensure that any update to
+     * state that should be visible to the caller of wait* or poll
+     * happens before the call to IEvent::notify.
+     *
+     * IEvent::notify can be called at most once on a given event.
+     *
+     * @param status neuralnetworks::V1_0::Status SUCCESS or ERROR
+     */
+    Return<void> notify(ReturnedStatus status) override;
+
+    /**
+     * Event::poll returns the current status of the event.
+     *
+     * @return Status SUCCESS, ERROR, or WAITING
+     */
+    Event::Status poll();
+
+    /**
+     * Event::wait blocks until the event has been signaled.
+     *
+     * @return Status SUCCESS or ERROR
+     */
+    Event::Status wait();
+
+    /**
+     * Event::wait_for blocks until the event has been signaled or the time
+     * duration from the time the wait_for function was called has expired,
+     * whichever comes first.
+     *
+     * @return Status SUCCESS, ERROR, or TIMEOUT
+     */
+    template<class Rep, class Period>
+    Event::Status wait_for(const std::chrono::duration<Rep,Period>& timeout_duration);
+
+    /**
+     * Event::wait_until blocks until the event has been signaled or a certain
+     * time has been reached, whichever comes first.
+     *
+     * @return Status SUCCESS, ERROR, or TIMEOUT
+     */
+    template<class Clock, class Duration>
+    Event::Status wait_until(const std::chrono::time_point<Clock,Duration>& timeout_time);
+
+    /**
+     * Event::on_finish binds a callback function to the event. The
+     * callback will be executed when IEvent::notify reports SUCCESS, before
+     * any calls to wait* return. (Note that wait_for or wait_until
+     * can return TIMEOUT before IEvent::notify is called for the
+     * first time, and hence before the callback is executed.)
+     *
+     * The callback function must not synchronize with or otherwise
+     * access the event object it is bound to.
+     *
+     * Event::on_finish can be called at most once on a given event.
+     *
+     * @param callback Function to be invoked when IEvent::notify reports
+     *                 SUCCESS. Must have a target -- i.e., must not compare equal
+     *                 to nullptr. Callback returns true if it successfully
+     *                 completes, false if it fails.
+     * @return bool True if the callback was successfully bound, false if
+     *              unsuccessful.
+     *
+     * TODO: What if notify has already been called before on_finish?
+     * TODO: Why does the return value of the callback matter?
+     */
+    bool on_finish(std::function<bool(void)> callback);
+
+    /**
+     * Event::bind_thread binds a thread to the event ensuring that the thread
+     * has fully finished and cleaned its resources before the event is
+     * destroyed. The thread should be bound using std::move.
+     *
+     * The bound thread shall not call any Event method with the exception of
+     * IEvent::notify, which it will call when the thread has finished its
+     * computation.
+     *
+     * Event::bind_thread can be called at most once on a given event.
+     *
+     * @param asyncThread Thread to be bound to the event. The thread object
+     *                    must represent a thread of execution -- i.e.,
+     *                    asyncThread.joinable() must be true.
+     * @return bool True if successful, false if thread was not properly bound.
+     */
+    bool bind_thread(std::thread&& asyncThread);
+
+ private:
+    Status                    mStatus;
+    std::mutex                mMutex;
+    std::condition_variable   mCondition;
+    std::function<bool(void)> mCallback;
+    std::thread               mThread;
+};
+
+
+// template function implementations
+
+template<class Rep, class Period>
+Event::Status Event::wait_for(const std::chrono::duration<Rep,Period>& timeout_duration) {
+    std::unique_lock<std::mutex> lock(mMutex);
+    auto signaled = [this]{return mStatus != Status::WAITING;};
+    // The predicate overload returns bool (the predicate's final value), not std::cv_status.
+    return mCondition.wait_for(lock, timeout_duration, signaled) ? mStatus : Status::TIMEOUT;
+}
+
+template<class Clock, class Duration>
+Event::Status Event::wait_until(const std::chrono::time_point<Clock,Duration>& timeout_time) {
+    std::unique_lock<std::mutex> lock(mMutex);
+    auto signaled = [this]{return mStatus != Status::WAITING;};
+    // The predicate overload returns bool (the predicate's final value), not std::cv_status.
+    return mCondition.wait_until(lock, timeout_time, signaled) ? mStatus : Status::TIMEOUT;
+}
+
+}  // namespace implementation
+}  // namespace V1_0
+}  // namespace neuralnetworks
+}  // namespace hardware
+}  // namespace android
+
+#endif  // ANDROID_HARDWARE_NEURALNETWORKS_V1_0_EVENT_H
diff --git a/nn/runtime/NeuralNetworks.cpp b/nn/runtime/NeuralNetworks.cpp
index b6afff9..705b219 100644
--- a/nn/runtime/NeuralNetworks.cpp
+++ b/nn/runtime/NeuralNetworks.cpp
@@ -20,6 +20,7 @@
 
 #define LOG_TAG "NeuralNetworks"
 
+#include "Event.h"
 #include "NeuralNetworks.h"
 #include "Manager.h"
 #include "Memory.h"
@@ -209,6 +210,7 @@
 static_assert(static_cast<uint32_t>(OperationType::TANH) == ANEURALNETWORKS_TANH,
               "OperationType::TANH != ANEURALNETWORKS_TANH");
 
+using android::sp;
 using namespace android::nn;
 
 // Validates the type. The used dimensions can be underspecified.
@@ -581,12 +583,20 @@
     // TODO validate the rest
 
     RequestBuilder* r = reinterpret_cast<RequestBuilder*>(request);
-    Event* e = nullptr;
-    int n = r->startCompute(&e);
+
+    // Dynamically allocate an sp to wrap an event. The sp<Event> object is
+    // returned when the request has been successfully launched, otherwise a
+    // nullptr is returned. The sp is used for ref-counting purposes. Without
+    // it, the HIDL service could attempt to communicate with a dead event
+    // object.
+    std::unique_ptr<sp<Event>> e = std::make_unique<sp<Event>>();
+    *event = nullptr;
+
+    int n = r->startCompute(e.get());
     if (n != ANEURALNETWORKS_NO_ERROR) {
         return n;
     }
-    *event = reinterpret_cast<ANeuralNetworksEvent*>(e);
+    *event = reinterpret_cast<ANeuralNetworksEvent*>(e.release());
     return ANEURALNETWORKS_NO_ERROR;
 }
 
@@ -596,13 +606,13 @@
         return ANEURALNETWORKS_UNEXPECTED_NULL;
     }
 
-    Event* e = reinterpret_cast<Event*>(event);
-    e->wait();
+    sp<Event>* e = reinterpret_cast<sp<Event>*>(event);
+    (*e)->wait();
     return ANEURALNETWORKS_NO_ERROR;
 }
 
 void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
     // No validation.  Free of nullptr is valid.
-    Event* e = reinterpret_cast<Event*>(event);
+    sp<Event>* e = reinterpret_cast<sp<Event>*>(event);
     delete e;
 }
diff --git a/nn/runtime/RequestBuilder.cpp b/nn/runtime/RequestBuilder.cpp
index 80f9303..fa9ad8e 100644
--- a/nn/runtime/RequestBuilder.cpp
+++ b/nn/runtime/RequestBuilder.cpp
@@ -23,6 +23,10 @@
 #include "Manager.h"
 #include "ModelBuilder.h"
 
+#include <thread>
+#include <mutex>
+#include <vector>
+
 namespace android {
 namespace nn {
 
@@ -126,7 +130,9 @@
     return ANEURALNETWORKS_NO_ERROR;
 }
 
-int RequestBuilder::startCompute(Event** event) {
+int RequestBuilder::startCompute(sp<Event>* event) {
+    *event = nullptr;
+
     // TODO validate that we have full types for all inputs and outputs,
     // that the graph is not cyclic,
     /*
@@ -151,7 +157,7 @@
     Model model;
     mModel->setHidlModel(&model);
 
-    return device == nullptr ? startComputeOnCpu(event, model)
+    return device == nullptr ? startComputeOnCpu(model, event)
                              : startComputeOnDevice(device->getInterface(), model, event);
 }
 
@@ -193,7 +199,10 @@
     }
 }
 
-int RequestBuilder::startComputeOnDevice(sp<IDevice> driver, const Model& model, Event** event) {
+int RequestBuilder::startComputeOnDevice(sp<IDevice> driver, const Model& model,
+                                         sp<Event>* event) {
+    *event = nullptr;
+
     LOG(DEBUG) << "RequestBuilder::startComputeOnDevice1";
     // TODO Dangerous!  In async, the model will outlive it here. Safe for now
     sp<IPreparedModel> preparedModel = driver->prepareModel(model);
@@ -231,15 +240,41 @@
         request.pools[i] = mMemories[i]->getHidlMemory();
     }
 
+    // Prepare the event for asynchronous execution. The sp<Event> object is
+    // returned when the request has been successfully launched, otherwise a
+    // nullptr is returned. The sp is used for ref-counting purposes. Without
+    // it, the HIDL service could attempt to communicate with a dead event
+    // object.
+    //
+    // TODO: Explain the "dead event" problem further, either here or
+    // in the design document.
+    sp<Event> eventSp = new Event();
+
     LOG(DEBUG) << "Before preparedModel->execute() " << toString(request);
     // Execute the request.
-    if (!preparedModel->execute(request)) {
+    // TODO: What happens to the Event if the service dies abnormally
+    // -- won't that keep the Event live forever, because the service
+    // never has the opportunity to bump the reference count down? Or
+    // maybe the HIDL infrastructure handles this magically? At worst,
+    // it seems like this is a small memory leak, if the Event stays
+    // alive forever.
+    if (!preparedModel->execute(request, eventSp)) {
         LOG(DEBUG) << "**Execute failed**";
         return ANEURALNETWORKS_OP_FAILED;
     }
 
+    // TODO: Remove this synchronization point when the block of code below is
+    // removed.
+    Event::Status status = eventSp->wait();
+    if (status != Event::Status::SUCCESS) {
+        LOG(DEBUG) << "**Execute async failed**";
+        return ANEURALNETWORKS_OP_FAILED;
+    }
+
     // Copy the output data from shared memory to the output buffers.
-    // TODO: outputMemory->update();
+    // TODO: Move this block of code somewhere else. It should not be in the
+    // startCompute function.
+    // TODO: outputMemory->update(); outputMemory->commit()
     for (auto& info : mOutputs) {
         if (info.state == ModelArgumentInfo::POINTER) {
             DataLocation& loc = info.locationAndDimension.location;
@@ -249,14 +284,27 @@
     }
     LOG(DEBUG) << "RequestBuilder::startComputeOnDevice completed";
 
-    *event = new Event(); // TODO pass ievent
+    *event = eventSp;
     return ANEURALNETWORKS_NO_ERROR;
 }
 
-int RequestBuilder::startComputeOnCpu(Event** event, [[maybe_unused]] const Model& model) {
+static void asyncStartComputeOnCpu(const Model& model, const Request& request,
+                                   const std::vector<RunTimePoolInfo>& runTimePoolInfos,
+                                   const sp<IEvent>& event) {
+    // Thread entry point: run the request on the CPU, then signal the paired event.
+    CpuExecutor executor;
+    const bool ok = executor.run(model, request, runTimePoolInfos) == ANEURALNETWORKS_NO_ERROR;
+    event->notify(ok ? Status::SUCCESS : Status::ERROR);
+}
+
+int RequestBuilder::startComputeOnCpu([[maybe_unused]] const Model& model, sp<Event>* event) {
     // TODO: use a thread pool
-    Event* e = new Event();
-    *event = e;
+
+    // Prepare the event for asynchronous execution. The sp<Event> object is
+    // returned when the request has been successfully launched, otherwise a
+    // nullptr is returned.
+    sp<Event> eventSp = new Event();
+    *event = nullptr;
 
     std::vector<RunTimePoolInfo> runTimePoolInfos;
     uint32_t count = mMemories.size();
@@ -284,8 +332,13 @@
     copyLocationAndDimension(mInputs, &request.inputs);
     copyLocationAndDimension(mOutputs, &request.outputs);
 
-    CpuExecutor executor;
-    return executor.run(model, request, runTimePoolInfos);
+    // TODO: should model be passed by reference (std::cref) instead of being copied?
+    std::thread thread(asyncStartComputeOnCpu, model, std::move(request),
+                       std::move(runTimePoolInfos), eventSp);
+    eventSp->bind_thread(std::move(thread));
+
+    *event = eventSp;
+    return ANEURALNETWORKS_NO_ERROR;
 }
 
 } // namespace nn
diff --git a/nn/runtime/RequestBuilder.h b/nn/runtime/RequestBuilder.h
index 80c3e1c..00a0273 100644
--- a/nn/runtime/RequestBuilder.h
+++ b/nn/runtime/RequestBuilder.h
@@ -17,6 +17,7 @@
 #ifndef ANDROID_ML_NN_RUNTIME_REQUEST_BUILDER_H
 #define ANDROID_ML_NN_RUNTIME_REQUEST_BUILDER_H
 
+#include "Event.h"
 #include "HalInterfaces.h"
 #include "Memory.h"
 #include "NeuralNetworks.h"
@@ -24,18 +25,14 @@
 #include <unordered_map>
 #include <vector>
 
+using ::android::hardware::neuralnetworks::V1_0::implementation::Event;
+
 namespace android {
 namespace nn {
 
 class Memory;
 class ModelBuilder;
 
-// TODO
-class Event {
-public:
-    void wait() {}
-};
-
 // TODO move length out of DataLocation
 struct ModelArgumentInfo {
     // Whether the arguement was specified as being in a Memory, as a pointer,
@@ -66,14 +63,14 @@
                   uint32_t length);
     int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                             const Memory* memory, uint32_t offset, uint32_t length);
-    int startCompute(Event** event);
+    int startCompute(sp<Event>* event);
 
 private:
     int allocatePointerArgumentsToPool(std::vector<ModelArgumentInfo>* args, Memory* memory);
     int updateDimensionInfo(ModelArgumentInfo* info, const ANeuralNetworksOperandType* newType,
                             uint32_t operandIndex);
-    int startComputeOnDevice(sp<IDevice> driver, const Model& model, Event** event);
-    int startComputeOnCpu(Event** event, const Model& model);
+    int startComputeOnDevice(sp<IDevice> driver, const Model& model, sp<Event>* event);
+    int startComputeOnCpu(const Model& model, sp<Event>* event);
 
     const ModelBuilder* mModel;
     // Whether the application prefers to go fast or use low power for this request.
diff --git a/nn/runtime/include/NeuralNetworks.h b/nn/runtime/include/NeuralNetworks.h
index 00c78e8..857dad4 100644
--- a/nn/runtime/include/NeuralNetworks.h
+++ b/nn/runtime/include/NeuralNetworks.h
@@ -631,17 +631,17 @@
                                                uint32_t length);
 
 /**
- * Queue the request for execution.
+ * Schedule the request for execution.
  *
- * <p>Puts the request in a queue for execution. Once the model has been
+ * <p>Schedules the request for execution. Once the model has been
  * applied and the outputs are ready to be consumed, the returned event will be
  * signaled. Use {@link ANeuralNetworksRequest_wait} to wait for that event.
  * </p>
  *
- * Multiple requests can be queued and executed concurrently. The runtime makes
+ * Multiple requests can be scheduled and executed concurrently. The runtime makes
  * no guarantee on the ordering of the completion of the requests.  If it's
- * important to the application, the application should enforces the ordering by
- * using the return events.
+ * important to the application, the application should enforce the ordering by
+ * using the returned events.
  *
  * ANeuralNetworksRequest_wait must be called to recuperate the resources used
  * by the event.
@@ -650,7 +650,8 @@
  *
  * @param request The request to be modified.
  * @param event The event that will be signaled on completion.
- *              [TODO define the functions to create/delete events]
+ *              [TODO define the functions to create/delete events?
+ *                    or startCompute creates, and free deletes?]
  *
  * @return NO_ERROR if successful, BAD_DATA if callback is NULL.
  */
diff --git a/nn/sample_driver/SampleDriver.cpp b/nn/sample_driver/SampleDriver.cpp
index 769b67c..600ffad 100644
--- a/nn/sample_driver/SampleDriver.cpp
+++ b/nn/sample_driver/SampleDriver.cpp
@@ -17,13 +17,18 @@
 #define LOG_TAG "SampleDriver"
 
 #include "SampleDriver.h"
+
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include <thread>
+
 namespace android {
 namespace nn {
 namespace sample_driver {
 
+SampleDriver::~SampleDriver() {}
+
 Return<void> SampleDriver::initialize(initialize_cb cb) {
     SetMinimumLogSeverity(base::VERBOSE);
     LOG(DEBUG) << "SampleDriver::initialize()";
@@ -158,6 +163,8 @@
     mModel = model;
 }
 
+SamplePreparedModel::~SamplePreparedModel() {}
+
 static bool mapPools(std::vector<RunTimePoolInfo>* poolInfos, const hidl_vec<hidl_memory>& pools) {
     poolInfos->resize(pools.size());
     for (size_t i = 0; i < pools.size(); i++) {
@@ -169,18 +176,36 @@
     return true;
 }
 
-Return<bool> SamplePreparedModel::execute(const Request& request) {
-    LOG(DEBUG) << "SampleDriver::prepareRequest(" << toString(request) << ")";
+void SamplePreparedModel::asyncExecute(const Request& request, const sp<IEvent>& event) {
+    if (event.get() == nullptr) {
+        LOG(ERROR) << "asyncExecute: invalid event";
+        return;
+    }
 
     std::vector<RunTimePoolInfo> poolInfo;
     if (!mapPools(&poolInfo, request.pools)) {
-        return false;
+        event->notify(Status::ERROR);
+        return;
     }
 
     CpuExecutor executor;
     int n = executor.run(mModel, request, poolInfo);
     LOG(DEBUG) << "executor.run returned " << n;
-    return n == ANEURALNETWORKS_NO_ERROR;
+    Status executionStatus = n == ANEURALNETWORKS_NO_ERROR ? Status::SUCCESS : Status::ERROR;
+    Return<void> returned = event->notify(executionStatus);
+    if (!returned.isOk()) {
+        LOG(ERROR) << "hidl callback failed to return properly: " << returned.description();
+    }
+}
+
+Return<bool> SamplePreparedModel::execute(const Request& request, const sp<IEvent>& event) {
+    LOG(DEBUG) << "SampleDriver::execute(" << toString(request) << ")";
+
+    // The worker is detached because the sample driver service is expected to live forever.
+    // It holds a strong reference so this prepared model outlives the asynchronous run.
+    sp<SamplePreparedModel> self(this);
+    std::thread([self, request, event]{ self->asyncExecute(request, event); }).detach();
+    return true;
+}
 
 } // namespace sample_driver
diff --git a/nn/sample_driver/SampleDriver.h b/nn/sample_driver/SampleDriver.h
index 280b3fe..b03da9e 100644
--- a/nn/sample_driver/SampleDriver.h
+++ b/nn/sample_driver/SampleDriver.h
@@ -29,20 +29,22 @@
 // An actual driver would not do that.
 class SampleDriver : public IDevice {
 public:
-    virtual ~SampleDriver() {}
-    virtual Return<void> initialize(initialize_cb _hidl_cb);
-    virtual Return<void> getSupportedSubgraph(const Model& model, getSupportedSubgraph_cb _hidl_cb);
-    virtual Return<sp<IPreparedModel>> prepareModel(const Model& model);
-    virtual Return<DeviceStatus> getStatus();
+    ~SampleDriver() override;
+    Return<void> initialize(initialize_cb _hidl_cb) override;
+    Return<void> getSupportedSubgraph(const Model& model,
+                                      getSupportedSubgraph_cb _hidl_cb) override;
+    Return<sp<IPreparedModel>> prepareModel(const Model& model) override;
+    Return<DeviceStatus> getStatus() override;
 };
 
 class SamplePreparedModel : public IPreparedModel {
 public:
     SamplePreparedModel(const Model& model);
-    virtual ~SamplePreparedModel() {}
-    virtual Return<bool> execute(const Request& request);
+    ~SamplePreparedModel() override;
+    Return<bool> execute(const Request& request, const sp<IEvent>& event) override;
 
 private:
+    void asyncExecute(const Request& request, const sp<IEvent>& event);
     Model mModel;
 };