| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #define LOG_TAG "ExecutionBuilder" |
| |
| #include "ExecutionBuilder.h" |
| |
| #include <android/sync.h> |
| |
| #include <algorithm> |
| #include <limits> |
| #include <memory> |
| #include <mutex> |
| #include <optional> |
| #include <string> |
| #include <thread> |
| #include <tuple> |
| #include <utility> |
| #include <vector> |
| |
| #include "CompilationBuilder.h" |
| #include "ControlFlow.h" |
| #include "CpuExecutor.h" |
| #include "ExecutionBurstController.h" |
| #include "HalInterfaces.h" |
| #include "Manager.h" |
| #include "ModelArgumentInfo.h" |
| #include "ModelBuilder.h" |
| #include "Tracing.h" |
| #include "TypeManager.h" |
| #include "Utils.h" |
| |
| namespace android { |
| namespace nn { |
| |
| using namespace hal; |
| |
| const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX}; |
| |
| static MeasureTiming measureTiming(const ExecutionBuilder* execution) { |
| return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO; |
| } |
| |
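| // Validates an optional ANeuralNetworksOperandType override against the operand declared in |
| // the model: |
| // - If newType is nullptr, a tensor operand must already have fully specified dimensions, |
| //   unless allowUnspecified is true. |
| // - If newType is non-null, it must match the operand's dimension count, and it may only |
| //   fill in dimensions that the model left as 0 (unspecified). |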
| static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType, |
| const char* tag, bool allowUnspecified) { |
| if (newType != nullptr) { |
| const Extension::OperandTypeInformation* info = nullptr; |
| if (isExtensionOperandType(operand.type)) { |
| NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info)); |
| } |
| if (validateOperandType(*newType, info, tag, allowUnspecified) != |
| ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << tag << ": Invalid newType"; |
| return false; |
| } |
| if (operand.dimensions.size() == 0) { |
| return true; |
| } |
| if (operand.dimensions.size() != newType->dimensionCount) { |
| LOG(ERROR) << tag << ": Setting with incompatible dimension count"; |
| return false; |
| } |
| for (uint32_t i = 0; i < newType->dimensionCount; i++) { |
| if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) { |
| LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed"; |
| return false; |
| } |
| } |
| } else { |
| if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) && |
| tensorHasUnspecifiedDimensions(operand)) { |
| LOG(ERROR) << tag << ": Setting with operand type that is not fully specified"; |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation) |
| : mCompilation(compilation), |
| mModel(compilation->mModel), |
| mPlan(&compilation->mPlan), |
| mPartitioning(compilation->mPartitioning), |
| mInputs(mModel->inputCount()), |
| mOutputs(mModel->outputCount()) { |
| VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size() |
| << " inputs and " << mOutputs.size() << " outputs"; |
| } |
| |
| const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const { |
| return mPlan->getSourceModels().getModel(index); |
| } |
| |
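| // A minimal NDK-level usage sketch of the input-setting path handled below (hypothetical |
| // buffer and index; error handling omitted): |
| //   float in[224 * 224 * 3] = {}; |
| //   ANeuralNetworksExecution_setInput(execution, /*index=*/0, /*type=*/nullptr, |
| //                                     in, sizeof(in)); |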
| int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type, |
| const void* buffer, size_t length) { |
| if (mStarted) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mInputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!checkDimensionInfo(mModel->getInputOperand(index), type, |
| "ANeuralNetworksExecution_setInput", buffer == nullptr)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (length > 0xFFFFFFFF) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| uint32_t l = static_cast<uint32_t>(length); |
| if (!mInputs[index].unspecified()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been " |
| "provided"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| int n; |
| std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer( |
| mModel->getInputOperand(index), type, const_cast<void*>(buffer), l); |
| return n; |
| } |
| |
| int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, |
| const Memory* memory, size_t offset, size_t length) { |
| // Should be similar to StepExecutor::setInputOrOutputFromMemory() |
| |
| if (mStarted) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mInputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " " |
| << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!checkDimensionInfo(mModel->getInputOperand(index), type, |
| "ANeuralNetworksExecution_setInputFromMemory", false)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset, |
| length)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| // For some types of memory, e.g. a MemoryRuntimeAHWB allocated from |
| // ANeuralNetworksMemory_createFromDesc, we allow the client to specify offset == 0 && |
| // length == 0, indicating that the entire memory region is used. We update the length here |
| // because the drivers still expect a real length. For other memories that do not allow |
| // these semantics, this is checked in MemoryValidatorBase::validate before reaching here. |
| if (memory->getHidlMemory().valid() && offset == 0 && length == 0) { |
| length = memory->getHidlMemory().size(); |
| } |
| // TODO validate the rest |
| uint32_t poolIndex = mMemories.add(memory); |
| if (!mInputs[index].unspecified()) { |
| LOG(ERROR) |
| << "ANeuralNetworksExecution_setInputFromMemory called when an input has already " |
| "been provided"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| int n; |
| std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromMemory( |
| mModel->getInputOperand(index), type, poolIndex, offset, length); |
| return n; |
| } |
| |
| int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type, |
| void* buffer, size_t length) { |
| if (mStarted) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mOutputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!checkDimensionInfo(mModel->getOutputOperand(index), type, |
| "ANeuralNetworksExecution_setOutput", true)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (length > 0xFFFFFFFF) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutput input exceeds max length " << length; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| uint32_t l = static_cast<uint32_t>(length); |
| if (!mOutputs[index].unspecified()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been " |
| "provided"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| int n; |
| std::tie(n, mOutputs[index]) = |
| ModelArgumentInfo::createFromPointer(mModel->getOutputOperand(index), type, buffer, l); |
| return n; |
| } |
| |
| int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, |
| const Memory* memory, size_t offset, size_t length) { |
| // Should be similar to StepExecutor::setInputOrOutputFromMemory() |
| |
| if (mStarted) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mOutputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " " |
| << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!checkDimensionInfo(mModel->getOutputOperand(index), type, |
| "ANeuralNetworksExecution_setOutputFromMemory", true)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset, |
| length)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| // For some types of memory, e.g. a MemoryRuntimeAHWB allocated from |
| // ANeuralNetworksMemory_createFromDesc, we allow the client to specify offset == 0 && |
| // length == 0, indicating that the entire memory region is used. We update the length here |
| // because the drivers still expect a real length. For other memories that do not allow |
| // these semantics, this is checked in MemoryValidatorBase::validate before reaching here. |
| if (memory->getHidlMemory().valid() && offset == 0 && length == 0) { |
| length = memory->getHidlMemory().size(); |
| } |
| // TODO validate the rest |
| uint32_t poolIndex = mMemories.add(memory); |
| if (!mOutputs[index].unspecified()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has " |
| "already been provided"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| int n; |
| std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromMemory( |
| mModel->getOutputOperand(index), type, poolIndex, offset, length); |
| return n; |
| } |
| |
| int ExecutionBuilder::setMeasureTiming(bool measure) { |
| if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on " |
| << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation " |
| << "that was not created by ANeuralNetworksCompilation_createForDevices " |
| << "with numDevices = 1"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (mStarted) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| mMeasureTiming = measure; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const { |
| if (!mFinished && !hasSyncFence()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the " |
| "execution has finished."; |
| *duration = UINT64_MAX; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| // If the sync fence is valid, perform a non-blocking check of its status. |
| // TODO(b/148423931): consider using a utility method to wait on the sync fence |
| // and distinguish the not-finished status and error state. |
| if (hasSyncFence() && sync_wait(mSyncFenceFd, 0) < 0) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the " |
| "execution has finished, or the execution has encountered an error."; |
| *duration = UINT64_MAX; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| |
| // NOTE: At the HAL level, timing is in microseconds. At the NDK level, nanoseconds. |
| const uint64_t kNanoPerMicro = 1000; |
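| // For example, a driver-reported duration of 1234 us is returned to the NDK as 1234000 ns. |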
| |
| if (!mMeasureTiming) { |
| *duration = UINT64_MAX; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| |
| // Timing might have been reported through another compute method. |
| // Only query the fenced callback if it is available; mTiming is not |
| // updated here so that this method can remain const. |
| Timing timingLaunched = mTiming; |
| Timing timingFenced = kNoTiming; |
| if (mFencedExecutionCallback != nullptr) { |
| ErrorStatus status; |
| const Return<void> ret = mFencedExecutionCallback->getExecutionInfo( |
| [&status, &timingLaunched, &timingFenced](ErrorStatus error, Timing tLaunched, |
| Timing tFenced) { |
| status = error; |
| timingLaunched = tLaunched; |
| timingFenced = tFenced; |
| }); |
| if (!ret.isOk()) { |
| *duration = UINT64_MAX; |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| if (status != ErrorStatus::NONE) { |
| *duration = UINT64_MAX; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| } |
| // timingFenced should be the same as timingLaunched for compute methods other than fenced |
| // compute. |
| if (timingFenced == kNoTiming) { |
| timingFenced = timingLaunched; |
| } |
| uint64_t microDuration = UINT64_MAX; |
| switch (durationCode) { |
| case ANEURALNETWORKS_DURATION_ON_HARDWARE: |
| microDuration = timingLaunched.timeOnDevice; |
| break; |
| case ANEURALNETWORKS_DURATION_IN_DRIVER: |
| microDuration = timingLaunched.timeInDriver; |
| break; |
| case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE: |
| microDuration = timingFenced.timeOnDevice; |
| break; |
| case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER: |
| microDuration = timingFenced.timeInDriver; |
| break; |
| default: |
| CHECK(!"unexpected"); |
| } |
| *duration = (microDuration == UINT64_MAX) ? UINT64_MAX : kNanoPerMicro * microDuration; |
| |
| VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::setTimeoutDuration(uint64_t duration) { |
| if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution " |
| "created from an ANeuralNetworksCompilation that was not created by " |
| "ANeuralNetworksCompilation_createForDevices with numDevices = 1"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (mStarted) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (duration > 0) { |
| mTimeoutDuration = duration; |
| } else { |
| mTimeoutDuration.reset(); |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const { |
| return mTimeoutDuration; |
| } |
| |
| int ExecutionBuilder::setLoopTimeout(uint64_t duration) { |
| if (mStarted) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (duration > operation_while::kTimeoutNsMaximum) { |
| LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed " |
| << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum; |
| duration = operation_while::kTimeoutNsMaximum; |
| } |
| mLoopTimeoutDuration = duration; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) { |
| if (!mFinished && !hasSyncFence()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the " |
| "execution has finished."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| // If the sync fence is valid, perform a non-blocking check of its status. |
| // TODO(b/148423931): consider using a utility method to wait on the sync fence |
| // and distinguish the not-finished status and error state. |
| if (hasSyncFence() && sync_wait(mSyncFenceFd, 0) < 0) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the " |
| "execution has finished, or the execution has encountered an error."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| |
| uint32_t count = static_cast<uint32_t>(mOutputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index |
| << " " << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| const auto& dims = mOutputs[index].dimensions(); |
| if (dims.empty()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions can not query " |
| "dimensions of a scalar"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| std::copy(dims.begin(), dims.end(), dimensions); |
| return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR |
| : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE; |
| } |
| |
| int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) { |
| if (!mFinished && !hasSyncFence()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the " |
| "execution has finished."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| // If the sync fence is valid, perform a non-blocking check of its status. |
| // TODO(b/148423931): consider using a utility method to wait on the sync fence |
| // and distinguish the not-finished status and error state. |
| if (hasSyncFence() && sync_wait(mSyncFenceFd, 0) < 0) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the " |
| "execution has finished, or the execution has encountered an error."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mOutputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " " |
| << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size()); |
| return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR |
| : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE; |
| } |
| |
| // Attempt synchronous execution of full model on CPU. |
| // TODO: How should we handle timing in this case? |
| // For Q this is irrelevant: We only support timing in conjunction |
| // with an explicit device list; and we do not support CPU fallback |
| // with an explicit device list. See CompilationBuilder::mExplicitDeviceList. |
| static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull( |
| ExecutionBuilder* executionBuilder) { |
| CHECK(executionBuilder != nullptr); |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull"); |
| VLOG(EXECUTION) << "cpuFallbackFull"; |
| |
| // Get fallback executor. |
| StepExecutor executor(executionBuilder, executionBuilder->getModel(), |
| DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr); |
| executor.mapInputsAndOutputsTrivially(); |
| |
| // Attempt fallback execution. |
| return executor.computeOnCpuFallback(); |
| } |
| |
| // Attempt synchronous execution on CPU. |
| // TODO: How should we handle timing in this case? |
| // For Q this is irrelevant: We only support timing in conjunction |
| // with an explicit device list; and we do not support CPU fallback |
| // with an explicit device list. See CompilationBuilder::mExplicitDeviceList. |
| static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>> |
| cpuFallbackPartial(const ExecutionPlan& plan, |
| std::shared_ptr<ExecutionPlan::Controller> controller) { |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial"); |
| VLOG(EXECUTION) << "cpuFallbackPartial"; |
| |
| // Get fallback executor. |
| std::shared_ptr<StepExecutor> executor; |
| int n1 = plan.fallback(controller, &executor); |
| if (n1 != ANEURALNETWORKS_NO_ERROR) { |
| return {n1, {}, kNoTiming, nullptr}; |
| } |
| CHECK(executor != nullptr); |
| |
| // Attempt fallback execution. |
| auto [n2, outputShapes, timing] = executor->computeOnCpuFallback(); |
| return {n2, std::move(outputShapes), timing, executor}; |
| } |
| |
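| // Iteratively executes the plan, one StepExecutor at a time. On a step failure, the |
| // fallback ladder is roughly: |
| //   1. partial fallback: re-run the failed step on the CPU (if the plan is not simple); |
| //   2. full fallback: re-run the entire model on the CPU; |
| //   3. otherwise, report the error through executionCallback. |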
| static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder, |
| const ExecutionPlan& plan, |
| std::shared_ptr<ExecutionPlan::Controller> controller, |
| bool allowFallback, |
| const std::optional<Deadline>& deadline, |
| const sp<ExecutionCallback>& executionCallback) { |
| CHECK(executionBuilder != nullptr); |
| VLOG(EXECUTION) << "ExecutionBuilder::compute (from plan, iteratively)"; |
| |
| std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes(); |
| Timing timing = kNoTiming; |
| // Disallow fallback when the ExecutionPlan is simple on CPU. |
| allowFallback &= !plan.isSimpleCpu(); |
| |
| while (true) { |
| VLOG(EXECUTION) << "looking for next StepExecutor"; |
| |
| // Get the current step of the execution. |
| std::shared_ptr<StepExecutor> executor; |
| std::shared_ptr<ExecutionBurstController> burstController; |
| int n = plan.next(controller, &executor, &burstController); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| // During the interpreted execution of control flow, a loop timeout |
| // might occur in ExecutionPlan::next(). |
| bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT || |
| n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT; |
| if (allowFallback && !missedDeadline) break; |
| executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming); |
| return; |
| } |
| |
| // If the code reached the end of the plan without error, then return |
| // with no error. |
| if (executor == nullptr) { |
| executionCallback->notify(ErrorStatus::NONE, outputShapes, timing); |
| return; |
| } |
| const bool executorIsCpu = executor->isCpu(); |
| |
| // Attempt to execute a single step of the execution. |
| auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController); |
| |
| // Update global outputs. |
| if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) { |
| stepN = ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| // If execution was successful, continue to next step. |
| if (stepN == ANEURALNETWORKS_NO_ERROR) { |
| // We only support collection of timing information in the case of a |
| // single step, so it's safe to just keep track of the last step's |
| // timing information. |
| timing = stepTiming; |
| continue; |
| } |
| |
| // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution. |
| if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) { |
| const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN); |
| executionCallback->notify(stepStatus, outputShapes, kNoTiming); |
| return; |
| } |
| |
| // If fallback is not allowed and there was an error, end execution. |
| if (!allowFallback) { |
| const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN); |
| executionCallback->notify(stepStatus, {}, kNoTiming); |
| return; |
| } |
| |
| // If CPU execution was already attempted, either: |
| // (1) perform a full fallback if the plan is not simple, or |
| // (2) return from the function with an error |
| if (executorIsCpu) { |
| if (!plan.isSimple()) break; |
| executionCallback->notify(convertResultCodeToErrorStatus(stepN), {}, kNoTiming); |
| return; |
| } |
| |
| // If the code reaches this point, attempt a partial fallback to CPU. |
| CHECK(allowFallback); |
| auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] = |
| cpuFallbackPartial(plan, controller); |
| |
| // Update global outputs. |
| if (fallbackExecutor != nullptr && |
| !fallbackExecutor->updateOutputShapes(fallbackOutputShapes, &outputShapes)) { |
| fallbackN = ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| // If execution was successful, continue to next step. |
| if (fallbackN == ANEURALNETWORKS_NO_ERROR) { |
| // We only support collection of timing information in the case of a |
| // single step, so it's safe to just keep track of the last step's |
| // timing information. |
| timing = fallbackTiming; |
| continue; |
| } |
| |
| // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution. |
| if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) { |
| const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN); |
| executionCallback->notify(fallbackStatus, outputShapes, kNoTiming); |
| return; |
| } |
| |
| // Do not fall back twice if the ExecutionPlan is simple. |
| if (plan.isSimple()) { |
| const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN); |
| executionCallback->notify(fallbackStatus, {}, kNoTiming); |
| return; |
| } |
| |
| // If the code reaches this point, then there was an error with the |
| // fallback. In this case, attempt full fallback. |
| break; |
| } |
| |
| // If the code has reached this point, a potentially recoverable error |
| // occurred during the step executions. Fall back to a full execution of |
| // the model on the CPU. |
| auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder); |
| const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN); |
| executionCallback->notify(fullStatus, fullOutputShapes, fullTiming); |
| } |
| |
| // In the case of partitioned execution, the startComputeFenced call returns the sync |
| // fence and the fenced compute callback produced by the last partition. |
| // Any failed partition will cause the whole execution to fall back to the CPU if |
| // allowFallback is set to true. |
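| // A sketch of the fence chaining across partitions (illustrative fd values): |
| //   step 0: computeFenced(waitFor = fds from the caller)  -> sync fd A |
| //   step 1: computeFenced(waitFor = {A})                  -> sync fd B |
| //   ... and the caller receives the last sync fd and callback. |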
| static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFenced( |
| ExecutionBuilder* executionBuilder, const ExecutionPlan& plan, |
| std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor, |
| uint64_t timeoutDurationAfterFence, const std::optional<Deadline>& deadline, |
| bool allowFallback) { |
| CHECK(executionBuilder != nullptr); |
| VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)"; |
| // Disallow fallback when the ExecutionPlan is simple on CPU. |
| allowFallback &= !plan.isSimpleCpu(); |
| |
| // Initialize waitForFds and syncFence for the first step. |
| std::vector<int> waitForFds = waitFor; |
| int syncFence = -1; |
| sp<hal::IFencedExecutionCallback> computeFencedCallback; |
| |
| while (true) { |
| VLOG(EXECUTION) << "looking for next StepExecutor"; |
| |
| // Get the current step of the execution. |
| std::shared_ptr<StepExecutor> executor; |
| int n = plan.next(controller, &executor, nullptr, syncFence); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| // During the interpreted execution of control flow, a loop timeout |
| // might occur in ExecutionPlan::next(). |
| bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT || |
| n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT; |
| if (allowFallback && !missedDeadline) break; |
| // Return -1 for the sync fence fd, and nullptr for the callback. |
| return std::make_tuple(n, -1, nullptr); |
| } |
| |
| // If the code reached the end of the plan without error, then return |
| // with no error. |
| if (executor == nullptr) { |
| // If the final step returns a -1 for sync fence, the execution is finished. |
| // Update the output shapes. |
| if (syncFence == -1) { |
| // TODO(miaowang): support dynamic output shape only with memory domain. |
| // For now just return the initial output shapes. |
| executionBuilder->finish(ErrorStatus::NONE, |
| executionBuilder->getInitialOutputShapes()); |
| } |
| return std::make_tuple(ANEURALNETWORKS_NO_ERROR, syncFence, computeFencedCallback); |
| } |
| const bool executorIsCpu = executor->isCpu(); |
| |
| // Attempt to execute a single step of the execution. |
| auto [stepN, syncFd, callback] = |
| executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline); |
| |
| // Update waitForFds, syncFence for the next step. |
| syncFence = syncFd; |
| computeFencedCallback = callback; |
| waitForFds.clear(); |
| if (syncFd > 0) { |
| waitForFds = {syncFd}; |
| } |
| |
| // If execution was successful, continue to next step. |
| if (stepN == ANEURALNETWORKS_NO_ERROR) { |
| continue; |
| } |
| // If fallback is not allowed and there was an error, end execution. |
| if (!allowFallback) { |
| return std::make_tuple(stepN, -1, nullptr); |
| } |
| |
| // If CPU execution was already attempted, either: |
| // (1) perform a full fallback if the plan is not simple, or |
| // (2) return from the function with an error |
| if (executorIsCpu) { |
| if (!plan.isSimple()) break; |
| return std::make_tuple(stepN, -1, nullptr); |
| } |
| // If the code reaches this point, a step running on a non-CPU device failed. |
| // In this case, attempt full fallback. |
| break; |
| } |
| |
| // If the code has reached this point, a potentially recoverable error |
| // occurred during the step executions. Fall back to a full execution of |
| // the model on the CPU. |
| VLOG(EXECUTION) << "Performing full fallback on the CPU."; |
| for (int syncFd : waitFor) { |
| if (syncFd > 0) { |
| int r = sync_wait(syncFd, -1); |
| if (r < 0) { |
| VLOG(EXECUTION) << "sync_wait failed, fd: " << syncFd; |
| return std::make_tuple(ANEURALNETWORKS_OP_FAILED, -1, nullptr); |
| } |
| } |
| } |
| auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder); |
| const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN); |
| syncFence = -1; |
| executionBuilder->finish(fullStatus, fullOutputShapes); |
| executionBuilder->reportTiming(fullTiming); |
| return std::make_tuple(fullN, syncFence, nullptr); |
| } |
| |
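| // A minimal NDK-level usage sketch of the fenced path entered below (hypothetical event |
| // `dep`; error handling omitted): |
| //   ANeuralNetworksEvent* done = nullptr; |
| //   const ANeuralNetworksEvent* deps[] = {dep}; |
| //   ANeuralNetworksExecution_startComputeWithDependencies( |
| //       execution, deps, /*numOfDependencies=*/1, /*duration=*/0, &done); |
| //   ANeuralNetworksEvent_wait(done); |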
| int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor, |
| uint64_t timeoutDurationAfterFence, int* syncFence) { |
| CHECK(syncFence != nullptr); |
| if (mStarted) { |
| LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies" |
| " called on an execution that has already started"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (timeoutDurationAfterFence > 0) { |
| if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) { |
| LOG(ERROR) |
| << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero " |
| "duration on an ANeuralNetworksExecution " |
| "created from an ANeuralNetworksCompilation that was not created by " |
| "ANeuralNetworksCompilation_createForDevices with numDevices = 1"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| } |
| const auto deadline = makeDeadline(mTimeoutDuration); |
| for (auto& p : mInputs) { |
| if (p.state() == ModelArgumentInfo::UNSPECIFIED) { |
| LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies" |
| " not all inputs specified"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| } |
| for (auto& p : mOutputs) { |
| if (p.state() == ModelArgumentInfo::UNSPECIFIED) { |
| LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies" |
| " not all outputs specified"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| } |
| for (uint32_t i = 0; i < mOutputs.size(); i++) { |
| if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE && |
| !checkDimensionInfo(mModel->getOutputOperand(i), nullptr, |
| "ANeuralNetworksExecution_startComputeWithDependencies", false)) { |
| LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies" |
| " not all outputs have fully specified dimensions"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| } |
| mStarted = true; |
| const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning); |
| std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr); |
| VLOG(EXECUTION) << "ExecutionBuilder::computeFenced"; |
| int result; |
| std::tie(result, mSyncFenceFd, mFencedExecutionCallback) = startComputeFenced( |
| this, *mPlan, controller, waitFor, timeoutDurationAfterFence, deadline, allowFallback); |
| *syncFence = mSyncFenceFd; |
| return result; |
| } |
| |
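| // Drives a whole-model computation. Exactly one of three modes applies: |
| // - synchronizationCallback == nullptr && burstBuilder == nullptr: synchronous compute; |
| // - synchronizationCallback != nullptr: asynchronous startCompute; |
| // - burstBuilder != nullptr: synchronous burst compute. |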
| int ExecutionBuilder::compute(sp<ExecutionCallback>* synchronizationCallback, |
| BurstBuilder* burstBuilder) { |
| CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr) |
| << "synchronizationCallback and burstBuilder cannot simultaneously be used"; |
| |
| const bool synchronous = (synchronizationCallback == nullptr); |
| if (!synchronous) { |
| *synchronizationCallback = nullptr; |
| } |
| |
| const auto deadline = makeDeadline(mTimeoutDuration); |
| |
| // TODO validate that we have full types for all inputs and outputs, |
| // and that the graph is not cyclic. |
| |
| auto name = [synchronous, burstBuilder] { |
| return burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute"; |
| }; |
| if (mStarted) { |
| LOG(ERROR) << "ANeuralNetworksExecution_" << name() |
| << " called on an execution that has already started"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| for (auto& p : mInputs) { |
| if (p.state() == ModelArgumentInfo::UNSPECIFIED) { |
| LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all inputs specified"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } else if (p.state() == ModelArgumentInfo::MEMORY) { |
| const Memory* memory = mMemories[p.locationAndLength().poolIndex]; |
| if (!memory->getValidator().validateInputDimensions(p.dimensions())) { |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| } |
| } |
| for (auto& p : mOutputs) { |
| if (p.state() == ModelArgumentInfo::UNSPECIFIED) { |
| LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all outputs specified"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| } |
| |
| auto wrappedFinish = [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) { |
| return finish(error, outputShapes); |
| }; |
| |
| // TODO: For asynchronous execution, entire plan-based-path should run in an |
| // asynchronous thread -- take the asynchronous thread logic out of |
| // CpuPreparedModel::execute() and use it to wrap the plan-based-path. |
| mStarted = true; |
| const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning); |
| std::shared_ptr<ExecutionPlan::Controller> controller = |
| mPlan->makeController(this, burstBuilder); |
| if (synchronous) { |
| VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)"; |
| sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback(); |
| localSynchronizationCallback->setOnFinish(wrappedFinish); |
| asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline, |
| localSynchronizationCallback); |
| localSynchronizationCallback->wait(); |
| if (mMeasureTiming) { |
| mTiming = localSynchronizationCallback->getTiming(); |
| } |
| return convertErrorStatusToResultCode(localSynchronizationCallback->getStatus()); |
| } else /* asynchronous */ { |
| // TODO: use a thread pool |
| // TODO(mikie): this could have NNTRACE so we could measure the overhead |
| // of spinning up a new thread. |
| |
| // Prepare the callback for asynchronous execution. An |
| // sp<ExecutionCallback> object is returned when the |
| // execution has been successfully launched; otherwise, |
| // nullptr is returned. The executionCallback is |
| // abstracted in the NN API as an "event". |
| sp<ExecutionCallback> executionCallback = new ExecutionCallback(); |
| executionCallback->setOnFinish(wrappedFinish); |
| if (DeviceManager::get()->syncExecRuntime()) { |
| VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)"; |
| asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline, |
| executionCallback); |
| } else { |
| VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)"; |
| std::thread asyncExecution( |
| [this, controller, allowFallback, deadline, executionCallback] { |
| asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, |
| deadline, executionCallback); |
| }); |
| executionCallback->bindThread(std::move(asyncExecution)); |
| } |
| *synchronizationCallback = executionCallback; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| } |
| |
| std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const { |
| std::vector<OutputShape> outputShapes(mOutputs.size()); |
| std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(), |
| [](const auto& x) -> OutputShape { |
| return {.dimensions = x.dimensions(), .isSufficient = true}; |
| }); |
| return outputShapes; |
| } |
| |
| // Check whether dimensions "to" can be updated with dimensions "from", where "from" must |
| // have a specification level at least as high as "to". |
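| // For example, an empty "to" (unknown rank) can be updated with any "from"; to = {0, 3} |
| // can be updated with from = {2, 3}; but to = {2, 3} cannot be updated with from = {4, 3}. |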
| static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) { |
| if (to.size() == 0) return true; |
| NN_RET_CHECK_EQ(to.size(), from.size()); |
| for (uint32_t i = 0; i < to.size(); i++) { |
| NN_RET_CHECK(to[i] == from[i] || to[i] == 0); |
| } |
| return true; |
| } |
| |
| bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) { |
| if (outputShapes.size() == 0) { |
| return true; |
| } |
| NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size()); |
| for (uint32_t i = 0; i < outputShapes.size(); i++) { |
| // Check if only unspecified dimensions or rank are overwritten. |
| NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions)); |
| } |
| for (uint32_t i = 0; i < outputShapes.size(); i++) { |
| mOutputs[i].dimensions() = outputShapes[i].dimensions; |
| mOutputs[i].isSufficient() = outputShapes[i].isSufficient; |
| } |
| return true; |
| } |
| |
| bool ExecutionBuilder::updateMemories() { |
| for (const auto& output : mOutputs) { |
| if (output.state() != ModelArgumentInfo::MEMORY) continue; |
| const Memory* memory = mMemories[output.locationAndLength().poolIndex]; |
| NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()})); |
| } |
| return true; |
| } |
| |
| ErrorStatus ExecutionBuilder::finish(ErrorStatus status, |
| const std::vector<OutputShape>& outputShapes) { |
| CHECK(!mFinished) << "ExecutionBuilder::finish is called twice"; |
| mFinished = true; |
| if (!updateOutputShapes(outputShapes) || !updateMemories()) { |
| status = ErrorStatus::GENERAL_FAILURE; |
| } |
| bool success = status == ErrorStatus::NONE; |
| for (const auto& output : mOutputs) { |
| if (output.state() != ModelArgumentInfo::MEMORY) continue; |
| const Memory* memory = mMemories[output.locationAndLength().poolIndex]; |
| memory->getValidator().setInitialized(success); |
| } |
| return status; |
| } |
| |
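| // Maps the step model's output shapes in "from" back onto the main model's output shapes |
| // in "to", using the step's output index mapping when one exists; shapes may only become |
| // more fully specified (see isUpdatable). |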
| bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from, |
| std::vector<OutputShape>* to) { |
| if (from.size() == 0) { |
| return true; |
| } |
| if (mExecutionStep != nullptr) { |
| const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel(); |
| NN_RET_CHECK_LE(indexMapping.size(), from.size()); |
| for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) { |
| uint32_t toIndex = indexMapping[i]; |
| NN_RET_CHECK_GT(to->size(), toIndex); |
| NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions)); |
| (*to)[toIndex] = from[i]; |
| } |
| } else { |
| NN_RET_CHECK_EQ(from.size(), to->size()); |
| for (uint32_t i = 0, e = from.size(); i < e; i++) { |
| NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions)); |
| (*to)[i] = from[i]; |
| } |
| } |
| return true; |
| } |
| |
| StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model, |
| std::shared_ptr<Device> device, |
| std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step) |
| : mExecutionBuilder(executionBuilder), |
| mExecutionStep(step), |
| mModel(model), |
| mDevice(device), |
| mPreparedModel(preparedModel), |
| mInputs(model->inputCount()), |
| mOutputs(model->outputCount()) { |
| CHECK(mDevice != nullptr); |
| VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and " |
| << mOutputs.size() << " outputs"; |
| } |
| |
| void StepExecutor::mapInputsAndOutputsTrivially() { |
| mInputs = mExecutionBuilder->mInputs; |
| mOutputs = mExecutionBuilder->mOutputs; |
| mMemories = mExecutionBuilder->mMemories; |
| } |
| |
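| // Copies the argument info from the ExecutionBuilder into this executor. For MEMORY |
| // arguments, the memory is re-registered in this executor's pool set and the pool index is |
| // remapped accordingly. |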
| void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput, |
| ModelArgumentInfo* executorInputOrOutput) { |
| *executorInputOrOutput = builderInputOrOutput; |
| switch (executorInputOrOutput->state()) { |
| default: |
| CHECK(false) << "unexpected ModelArgumentInfo::state"; |
| break; |
| case ModelArgumentInfo::HAS_NO_VALUE: |
| case ModelArgumentInfo::POINTER: |
| case ModelArgumentInfo::UNSPECIFIED: |
| break; |
| case ModelArgumentInfo::MEMORY: { |
| const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex; |
| const Memory* memory = mExecutionBuilder->mMemories[builderPoolIndex]; |
| const uint32_t executorPoolIndex = mMemories.add(memory); |
| executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex; |
| break; |
| } |
| } |
| } |
| |
| int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, |
| const Memory* memory, uint32_t offset, |
| ModelArgumentInfo* inputOrOutputInfo) { |
| // Should be similar to |
| // ExecutionBuilder::setInputFromMemory() |
| // ExecutionBuilder::setOutputFromMemory() |
| |
| uint32_t poolIndex = mMemories.add(memory); |
| uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand); |
| CHECK(inputOrOutputInfo->unspecified()); |
| int n; |
| std::tie(n, *inputOrOutputInfo) = |
| ModelArgumentInfo::createFromMemory(inputOrOutputOperand, |
| /*type=*/nullptr, poolIndex, offset, length); |
| return n; |
| } |
| |
| static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) { |
| for (unsigned i = 0; i < args.size(); i++) { |
| const auto& arg = args[i]; |
| std::string prefix = kind + std::string("[") + std::to_string(i) + "] = "; |
| switch (arg.state()) { |
| case ModelArgumentInfo::POINTER: |
| VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ")"; |
| break; |
| case ModelArgumentInfo::MEMORY: |
| VLOG(EXECUTION) << prefix << "MEMORY(" |
| << "pool=" << arg.locationAndLength().poolIndex << ", " |
| << "off=" << arg.locationAndLength().offset << ")"; |
| break; |
| case ModelArgumentInfo::HAS_NO_VALUE: |
| VLOG(EXECUTION) << prefix << "HAS_NO_VALUE"; |
| break; |
| case ModelArgumentInfo::UNSPECIFIED: |
| VLOG(EXECUTION) << prefix << "UNSPECIFIED"; |
| break; |
| default: |
| VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")"; |
| break; |
| } |
| } |
| } |
| |
| bool StepExecutor::isCpu() const { |
| return mDevice == DeviceManager::getCpuDevice(); |
| } |
| |
| static OptionalTimeoutDuration makeTimeoutDuration(uint64_t nanoseconds) { |
| OptionalTimeoutDuration otd; |
| otd.nanoseconds(nanoseconds); |
| return otd; |
| } |
| |
| std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute( |
| const std::optional<Deadline>& deadline, |
| const std::shared_ptr<ExecutionBurstController>& burstController) { |
| CHECK(mPreparedModel != nullptr); |
| |
| if (VLOG_IS_ON(EXECUTION)) { |
| logArguments("input", mInputs); |
| logArguments("output", mOutputs); |
| } |
| |
| const MeasureTiming measure = measureTiming(mExecutionBuilder); |
| const OptionalTimeoutDuration loopTimeoutDuration = |
| makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); |
| const auto [n, outputShapes, timing] = mPreparedModel->execute( |
| mInputs, mOutputs, mMemories, burstController, measure, deadline, loopTimeoutDuration); |
| mExecutionBuilder->reportTiming(timing); |
| |
| return {n, std::move(outputShapes), timing}; |
| } |
| |
| std::tuple<int, int, sp<hal::IFencedExecutionCallback>> StepExecutor::computeFenced( |
| const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence, |
| const std::optional<Deadline>& deadline) { |
| CHECK(mPreparedModel != nullptr); |
| |
| if (VLOG_IS_ON(EXECUTION)) { |
| logArguments("input", mInputs); |
| logArguments("output", mOutputs); |
| } |
| |
| const MeasureTiming measure = measureTiming(mExecutionBuilder); |
| const OptionalTimeoutDuration loopTimeoutDuration = |
| makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); |
| OptionalTimeoutDuration optionalTimeoutDurationAfterFence; |
| if (timeoutDurationAfterFence > 0) { |
| optionalTimeoutDurationAfterFence.nanoseconds(timeoutDurationAfterFence); |
| } |
| const auto [n, syncFence, computeFencedCallback, timing] = |
| mPreparedModel->executeFenced(mInputs, mOutputs, mMemories, waitFor, measure, deadline, |
| loopTimeoutDuration, optionalTimeoutDurationAfterFence); |
| if (syncFence < 0 && computeFencedCallback == nullptr) { |
| mExecutionBuilder->reportTiming(timing); |
| } |
| return {n, syncFence, computeFencedCallback}; |
| } |
| |
| // For cpuFallback{Partial,Full}, recompile the model on the CPU and then start the computation. |
| std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() { |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback"); |
| VLOG(EXECUTION) << "Re-compile the model on CPU"; |
| mDevice = DeviceManager::getCpuDevice(); |
| mPreparedModel = nullptr; |
| const ModelFactory makeModel = [this] { return mModel->makeHidlModel(); }; |
| // TODO: Propagate user preference and compilation priority to this point instead of using |
| // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and |
| // ANEURALNETWORKS_PRIORITY_MEDIUM |
| const ExecutionPreference preference = |
| static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER); |
| const Priority priority = convertToHalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT); |
| const auto [n, preparedModel] = |
| mDevice->prepareModel(makeModel, preference, priority, {}, {}, {}); |
| mPreparedModel = preparedModel; |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| return {n, {}, kNoTiming}; |
| } |
| return compute({}, /*burstController=*/nullptr); |
| } |
| |
| } // namespace nn |
| } // namespace android |