[automerger skipped] Merge changes Iede89032,I105222e4,I42d1d587,Ic6d491a3,I10e7e93d, ... am: 23c6cd6735 am: a9e8bd6fb8 am: 1a51b54180 -s ours
am skip reason: Change-Id Iede89032f584c94c7083069280c9afe1abb5df1e with SHA-1 3533c30adb is in history

Change-Id: Idaac624c74dfbbb55ea7d8bd4693ab4b2f32f996
diff --git a/nn/apex/AndroidManifest.xml b/nn/apex/AndroidManifest.xml
index 255d2a5..635e27c 100644
--- a/nn/apex/AndroidManifest.xml
+++ b/nn/apex/AndroidManifest.xml
@@ -23,7 +23,6 @@
     -->
   <uses-sdk
       android:minSdkVersion="29"
-      android:maxSdkVersion="30"
-      android:targetSdkVersion="30"
+      android:targetSdkVersion="29"
   />
 </manifest>
diff --git a/nn/apex/manifest.json b/nn/apex/manifest.json
index 4f70840..0afdd5a 100644
--- a/nn/apex/manifest.json
+++ b/nn/apex/manifest.json
@@ -1,4 +1,4 @@
 {
   "name": "com.android.neuralnetworks",
-  "version": 1
+  "version": 3
 }
diff --git a/nn/runtime/ExecutionBuilder.cpp b/nn/runtime/ExecutionBuilder.cpp
index da498ac..2c14a07 100644
--- a/nn/runtime/ExecutionBuilder.cpp
+++ b/nn/runtime/ExecutionBuilder.cpp
@@ -24,6 +24,7 @@
 #include <optional>
 #include <string>
 #include <thread>
+#include <tuple>
 #include <utility>
 #include <vector>
 
@@ -600,7 +601,7 @@
         return true;
     }
     if (mExecutionStep != nullptr) {
-        const auto& indexMapping = mExecutionStep->getOutputIndexSubModelToFromModel();
+        const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
         NN_RET_CHECK_LE(indexMapping.size(), from.size());
         for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
             uint32_t toIndex = indexMapping[i];
diff --git a/nn/runtime/ExecutionBuilder.h b/nn/runtime/ExecutionBuilder.h
index 3d8ab3e..ca7a089 100644
--- a/nn/runtime/ExecutionBuilder.h
+++ b/nn/runtime/ExecutionBuilder.h
@@ -146,8 +146,8 @@
     // executionBuilder
     //     Describes the full (possibly multiple-"step") execution.
     // model
-    //     The model to be executed by the executor.  Possibly a
-    //     submodel of the model from executionBuilder.
+    //     The model to be executed by the executor.  Possibly a single
+    //     "step" model of a multiple-"step" executionBuilder.
     // driver, preparedModel
     //     The device on which to execute the "step", and the prepared
     //     model to execute on that device.  (Both are nullptr in the
diff --git a/nn/runtime/ExecutionPlan.cpp b/nn/runtime/ExecutionPlan.cpp
index 9722a97..16654fd 100644
--- a/nn/runtime/ExecutionPlan.cpp
+++ b/nn/runtime/ExecutionPlan.cpp
@@ -148,8 +148,7 @@
             if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                 lifetime == OperandLifeTime::MODEL_OUTPUT) {
                 count++;
-                mOperandToOperations.insert(
-                        std::pair<uint32_t, uint32_t>(operandIndex, operationIndex));
+                mOperandToOperations.emplace(operandIndex, operationIndex);
             }
         }
         if (count == 0) {
@@ -177,26 +176,30 @@
 
 ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
                              std::shared_ptr<Device> device)
-    : mPlan(plan), mIndex(stepIndex), mSubModel(), mDevice(device), mToken(plan->getCacheToken()) {}
+    : mPlan(plan),
+      mIndex(stepIndex),
+      mStepModel(),
+      mDevice(device),
+      mToken(plan->getCacheToken()) {}
 
 // Adds an operand if it has not been added already.
-// Sets the index in the submodel for the corresponding operand.
-int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
-                              const ModelBuilder& fromModel, OperandKind kind) {
+// Sets the index in the step model for the corresponding operand.
+int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* toOperandIndex,
+                              const ModelBuilder& sourceModel, OperandKind kind) {
     // Have we added this operand already?
-    auto i = mOperandMap.find(fromOperandIndex);
+    auto i = mOperandMap.find(sourceOperandIndex);
     if (i != mOperandMap.end()) {
-        nnAssert(kind == INPUT);
+        CHECK(kind == INPUT);
         *toOperandIndex = i->second;
         return ANEURALNETWORKS_NO_ERROR;
     }
 
     // First time we add this operand.
-    *toOperandIndex = mSubModel.operandCount();
-    mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex));
+    *toOperandIndex = mStepModel.operandCount();
+    mOperandMap.emplace(sourceOperandIndex, *toOperandIndex);
 
-    // Add the operand to the submodel.
-    const Operand& operand = fromModel.getOperand(fromOperandIndex);
+    // Add the operand to the step model.
+    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
     ANeuralNetworksOperandType type = {
             .type = static_cast<int32_t>(operand.type),
             .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
@@ -205,13 +208,13 @@
             .zeroPoint = operand.zeroPoint,
     };
 
-    int n = mSubModel.addOperand(type);
+    int n = mStepModel.addOperand(type);
     if (n != ANEURALNETWORKS_NO_ERROR) {
         LOG(ERROR) << "Previous error occurred when partitioning the graph";
         return n;
     }
 
-    n = copyOperandExtraParams(mSubModel, *toOperandIndex, operand);
+    n = copyOperandExtraParams(mStepModel, *toOperandIndex, operand);
     if (n != ANEURALNETWORKS_NO_ERROR) {
         LOG(ERROR) << "Error when copying extra parameters to the operand";
         return n;
@@ -220,16 +223,16 @@
     // Sets its value.
     switch (operand.lifetime) {
         case OperandLifeTime::CONSTANT_COPY: {
-            const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
-            n = mSubModel.setOperandValue(*toOperandIndex, data, operand.location.length);
+            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
+            n = mStepModel.setOperandValue(*toOperandIndex, data, operand.location.length);
             if (n != ANEURALNETWORKS_NO_ERROR) {
                 LOG(ERROR) << "Previous error occurred when partitioning the graph";
                 return n;
             }
         } break;
         case OperandLifeTime::CONSTANT_REFERENCE: {
-            const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
-            n = mSubModel.setOperandValueFromMemory(
+            const Memory* memory = sourceModel.getMemories()[operand.location.poolIndex];
+            n = mStepModel.setOperandValueFromMemory(
                     *toOperandIndex, memory, operand.location.offset, operand.location.length);
             if (n != ANEURALNETWORKS_NO_ERROR) {
                 LOG(ERROR) << "Previous error occurred when partitioning the graph";
@@ -237,7 +240,7 @@
             }
         } break;
         case OperandLifeTime::NO_VALUE: {
-            n = mSubModel.setOperandValue(*toOperandIndex, nullptr, 0);
+            n = mStepModel.setOperandValue(*toOperandIndex, nullptr, 0);
             if (n != ANEURALNETWORKS_NO_ERROR) {
                 LOG(ERROR) << "Previous error occurred when partitioning the graph";
                 return n;
@@ -248,40 +251,39 @@
                 // The first time we've seen this operand is as an
                 // input.  That means it must be defined by a
                 // different partition, and is an input to this one.
-                mTempsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
+                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *toOperandIndex);
             } else {
                 // The first time we've seen this operand is as an
                 // output.  It may be an input to a different
                 // partition, so keep track of it.
-                mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
+                mPlan->recordTemporaryDef(sourceOperandIndex, mIndex);
             }
             break;
         case OperandLifeTime::MODEL_INPUT:
-            mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
+            mModelInputs.emplace_back(sourceOperandIndex, *toOperandIndex);
             break;
         case OperandLifeTime::MODEL_OUTPUT:  // handled similarly to TEMPORARY_VARIABLE
             if (kind == INPUT) {
                 // The first time we've seen this operand is as an
                 // input.  That means it must be defined by a
                 // different partition, and is an input to this one.
-                mOutputsAsSubModelInputs.push_back(
-                        std::make_pair(fromOperandIndex, *toOperandIndex));
+                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *toOperandIndex);
             } else {
                 // The first time we've seen this operand is as an
                 // output.
-                mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
+                mModelOutputs.emplace_back(sourceOperandIndex, *toOperandIndex);
             }
             break;
         default:
-            nnAssert(false);
+            CHECK(false);
             break;
     }
 
     return ANEURALNETWORKS_NO_ERROR;
 }
 
-int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
-    const Operation& operation = fromModel.getOperation(operationIndex);
+int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& sourceModel) {
+    const Operation& operation = sourceModel.getOperation(operationIndex);
     if (mToken.ok()) {
         mToken.update(&operationIndex, sizeof(operationIndex));
     }
@@ -299,13 +301,13 @@
     std::vector<uint32_t> inputs(inputCount);
     std::vector<uint32_t> outputs(outputCount);
 
-    auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands,
-                                          std::vector<uint32_t>& localOperands,
-                                          OperandKind kind) -> int {
+    auto addOperands = [this, &sourceModel](const hidl_vec<uint32_t>& globalOperands,
+                                            std::vector<uint32_t>& localOperands,
+                                            OperandKind kind) -> int {
         const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
         for (uint32_t i = 0; i < operandCount; i++) {
             uint32_t localOperand = ~0U;
-            int n = addOperand(globalOperands[i], &localOperand, fromModel, kind);
+            int n = addOperand(globalOperands[i], &localOperand, sourceModel, kind);
             if (n != ANEURALNETWORKS_NO_ERROR) return n;
             localOperands[i] = localOperand;
         }
@@ -318,34 +320,71 @@
         return n;
     }
 
-    return mSubModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
-                                  outputCount, outputs.data());
+    return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
+                                   outputCount, outputs.data());
 }
 
-void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
-    for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) {
-        stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i);
+void ExecutionStep::mapInputsAndOutputs(
+        std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
+        const std::map<uint32_t, uint32_t>& sourceOperandToOffsetOfTemporary,
+        const std::map<uint32_t, uint32_t>& sourceOperandToInputIndex,
+        const std::map<uint32_t, uint32_t>& sourceOperandToOutputIndex) const {
+    auto mapInput = [&](uint32_t sourceOperandIndex, uint32_t stepInputIndex) {
+        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
+            it != sourceOperandToOffsetOfTemporary.end()) {
+            executor->setInputFromTemporaryMemory(stepInputIndex, temporaryMemory, it->second);
+        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
+                   it != sourceOperandToInputIndex.end()) {
+            executor->mapInput(it->second, stepInputIndex);
+        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
+                   it != sourceOperandToOutputIndex.end()) {
+            executor->mapOutputToInput(it->second, stepInputIndex);
+        } else {
+            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
+                         << sourceOperandIndex;
+        }
+    };
+    auto mapOutput = [&](uint32_t sourceOperandIndex, uint32_t stepOutputIndex) {
+        if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
+            it != sourceOperandToOffsetOfTemporary.end()) {
+            executor->setOutputFromTemporaryMemory(stepOutputIndex, temporaryMemory, it->second);
+        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
+                   it != sourceOperandToOutputIndex.end()) {
+            executor->mapOutput(it->second, stepOutputIndex);
+        } else {
+            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
+                         << sourceOperandIndex;
+        }
+    };
+    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
+        mapInput(mStepModelInputs[i].first, i);
     }
-    for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) {
-        stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i);
+    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
+        mapOutput(mStepModelOutputs[i].first, i);
     }
 }
 
-void ExecutionPlan::CompoundBody::findTempsAsSubModelOutputs() {
+void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
     for (const auto& step : mSteps) {
-        for (const auto& input : step->getTempsAsSubModelInputs()) {
-            const uint32_t fromModelIndex = input.first;
-            const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
+        for (const auto& input : step->getTempsAsStepModelInputs()) {
+            const uint32_t sourceOperandIndex = input.first;
+            const auto it = mTemporaryToDefiningStep.find(sourceOperandIndex);
             nnAssert(it != mTemporaryToDefiningStep.end());
             const uint32_t stepIndex = it->second;
             nnAssert(stepIndex < mSteps.size());
-            mSteps[stepIndex]->recordTempAsSubModelOutput(fromModelIndex);
+            mSteps[stepIndex]->recordTempAsStepModelOutput(sourceOperandIndex);
         }
     }
 }
 
-void ExecutionStep::logSubModel() const {
-    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;
+void ExecutionStep::recordTempAsStepModelOutput(uint32_t sourceOperandIndex) {
+    const auto it = mOperandMap.find(sourceOperandIndex);
+    CHECK(it != mOperandMap.end());
+    mTempsAsStepModelOutputs.emplace(sourceOperandIndex, it->second);
+}
+
+void ExecutionStep::logStepModel() const {
+    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;
 
     auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
         if (!toLog.empty()) {
@@ -365,7 +404,7 @@
         }
         VLOG(COMPILATION) << name << ": " << toLog;
     };
-    auto logRemapSet = [&logRemapEntry](const char* name, const SubModelOutputSetType& set) {
+    auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
         std::string toLog;
         for (const auto& e : set) {
             logRemapEntry(toLog, e);
@@ -373,69 +412,21 @@
         VLOG(COMPILATION) << name << ": " << toLog;
     };
 
+    logRemapVector("step model inputs", mStepModelInputs);
+    logRemapVector("step model outputs", mStepModelOutputs);
     logRemapVector("model inputs", mModelInputs);
     logRemapVector("model outputs", mModelOutputs);
-    logRemapVector("temps as submodel inputs", mTempsAsSubModelInputs);
-    logRemapSet("temps as submodel outputs", mTempsAsSubModelOutputs);
-    logRemapVector("outputs as submodel inputs", mOutputsAsSubModelInputs);
+    logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
+    logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
+    logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
 }
 
-static void convertModelInputsOrOutputs(
-        // IN: mModel{Inputs|Outputs}
-        const ExecutionStep::RemapVectorType& myModelInputsOrOutputs,
-        // IN: fromModel->{input|output}Count()
-        uint32_t fromModelInputOrOutputCount,
-        // IN: fromModel->get{Input|Output}OperandIndex
-        std::function<uint32_t(uint32_t)> fromModelGetInputOrOutputOperandIndex,
-        // OUT: for v : mModel{Inputs|Outputs} : v.second
-        std::vector<uint32_t>* inputsOrOutputs,
-        // OUT: submodel input-or-output index to original model input-or-output index
-        std::vector<uint32_t>* inputOrOutputIndexSubModelToFromModel) {
-    std::map<uint32_t, uint32_t> fromModelIndexMap;  // operand index to input-or-output index
-    for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) {
-        fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i;
-    }
-    for (const auto& myInputOrOutput : myModelInputsOrOutputs) {
-        inputsOrOutputs->push_back(myInputOrOutput.second);
-        const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first];
-        inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex);
-    }
-}
+int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
+                                   int32_t executionPreference) {
+    CHECK(mDevice != nullptr);
 
-int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize,
-                                  int32_t executionPreference) {
-    nnAssert(mDevice != nullptr);
-    if (VLOG_IS_ON(COMPILATION)) {
-        logSubModel();
-    }
-
-    mSubModel.relaxComputationFloat32toFloat16(fromModel->isComputationFloat32RelaxedToFloat16());
-
-    // Input order: mModelInputs, mTempsAsSubModelInputs, mOutputsAsSubModelInputs
-    // Output order: mModelOutputs, mTempsAsSubModelOutputs
-    //
-    // ExecutionPlan::next() depends on these orderings.
-
-    std::vector<uint32_t> inputs;
-    convertModelInputsOrOutputs(
-            mModelInputs, fromModel->inputCount(),
-            [=](uint32_t i) { return fromModel->getInputOperandIndex(i); }, &inputs,
-            &mInputIndexSubModelToFromModel);
-    for (const auto& subModelInput : mTempsAsSubModelInputs) {
-        inputs.push_back(subModelInput.second);
-    }
-    for (const auto& subModelInput : mOutputsAsSubModelInputs) {
-        inputs.push_back(subModelInput.second);
-    }
-
-    std::vector<uint32_t> outputs;
-    convertModelInputsOrOutputs(
-            mModelOutputs, fromModel->outputCount(),
-            [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); }, &outputs,
-            &mOutputIndexSubModelToFromModel);
-    for (const auto& subModelOutput : mTempsAsSubModelOutputs) {
-        outputs.push_back(subModelOutput.second);
-        const Operand& operand = mSubModel.getOperand(subModelOutput.second);
+    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
+        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
         if (operand.dimensions.size() == 0) {
             *hasOutputOfUnknownSize = true;
         } else {
@@ -447,80 +438,94 @@
             }
         }
         if (*hasOutputOfUnknownSize) {
-            VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
-                              << " of original graph) has unknown size: " << toString(operand);
+            VLOG(COMPILATION) << "StepModelOutput (operand#" << stepModelOutput.first
+                              << " of source graph) has unknown size: " << toString(operand);
         }
     }
 
-    {
-        int n = mSubModel.identifyInputsAndOutputs(inputs.size(), &inputs[0], outputs.size(),
-                                                   &outputs[0]);
-        if (n != ANEURALNETWORKS_NO_ERROR) {
-            return n;
-        }
-        n = mSubModel.finish();
-        if (n != ANEURALNETWORKS_NO_ERROR) {
-            return n;
-        }
+    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());
+
+    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
+    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
+                            mTempsAsStepModelInputs.end());
+    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
+                            mOutputsAsStepModelInputs.end());
+
+    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
+    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
+                             mTempsAsStepModelOutputs.end());
+
+    std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
+    for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
+        mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
+    }
+    // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
+    // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
+    mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
+    std::transform(mModelOutputs.begin(), mModelOutputs.end(),
+                   mOutputIndexStepModelToMainModel.begin(),
+                   [&mainModelOperandToOutputIndex](auto& e) {
+                       uint32_t sourceOperandIndex = e.first;
+                       return mainModelOperandToOutputIndex[sourceOperandIndex];
+                   });
+
+    if (VLOG_IS_ON(COMPILATION)) {
+        logStepModel();
     }
 
-    {
-        // Compute mOutputsAsSubModelInputsIndexToFromModel.
-
-        std::map<uint32_t, uint32_t> fromModelOperandIndexToOutputIndex;
-        for (unsigned i = 0, e = fromModel->outputCount(); i < e; ++i) {
-            fromModelOperandIndexToOutputIndex[fromModel->getOutputOperandIndex(i)] = i;
-        }
-
-        for (unsigned i = 0, e = mOutputsAsSubModelInputs.size(); i < e; i++) {
-            const uint32_t fromModelOperandIndex = mOutputsAsSubModelInputs[i].first;
-            const auto it = fromModelOperandIndexToOutputIndex.find(fromModelOperandIndex);
-            if (it == fromModelOperandIndexToOutputIndex.end()) {
-                LOG(ERROR) << "Could not find main model output operand " << fromModelOperandIndex
-                           << " in main model output operand list";
-                return ANEURALNETWORKS_BAD_STATE;
-            }
-            mOutputsAsSubModelInputsIndexToFromModel.push_back(it->second);
-        }
-    }
+    std::vector<uint32_t> inputs(mStepModelInputs.size());
+    std::vector<uint32_t> outputs(mStepModelOutputs.size());
+    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
+                   [](auto& e) { return e.second; });
+    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
+                   [](auto& e) { return e.second; });
+    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
+                                                           outputs.size(), outputs.data()));
+    NN_RETURN_IF_ERROR(mStepModel.finish());
 
     // TODO: Move compilation elsewhere?
-    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation on " << mDevice->getName();
-    return compile(*mDevice, mSubModel, executionPreference, *mPlan->getCacheDir(), &mToken,
-                   &mPreparedSubModel);
+    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
+    return compile(*mDevice, mStepModel, executionPreference, *mPlan->getCacheDir(), &mToken,
+                   &mPreparedStepModel);
 }
 
 void ExecutionStep::dump() const {
     if (VLOG_IS_ON(COMPILATION)) {
         VLOG(COMPILATION) << "ExecutionStep#" << mIndex << " for " << mDevice->getName();
-        logModelToInfo(mSubModel.makeHidlModel());
+        logModelToInfo(mStepModel.makeHidlModel());
     }
 }
 
-int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel,
+int ExecutionPlan::CompoundBody::finish(const ModelBuilder* mainModel,
                                         int32_t executionPreference) {
-    findTempsAsSubModelOutputs();
+    findTempsAsStepModelOutputs();
     for (const auto& step : mSteps) {
-        int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize,
-                                     executionPreference);
+        int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize,
+                                      executionPreference);
         if (n != ANEURALNETWORKS_NO_ERROR) {
-            VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed";
+            VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
             return n;
         }
     }
-    if (mHasSubModelOutputOfUnknownSize) {
+    if (mHasStepModelOutputOfUnknownSize) {
         VLOG(COMPILATION)
-                << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize";
+                << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize";
         return ANEURALNETWORKS_OP_FAILED;
     }
 
+    for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
+        mSourceOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
+    }
+    for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
+        mSourceOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
+    }
+
     mSuccessfulFinish = true;
     return ANEURALNETWORKS_NO_ERROR;
 }
 
-int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel,
-                                      int32_t executionPreference) {
-    nnAssert(mDevice != nullptr);
+int ExecutionPlan::SimpleBody::finish(const ModelBuilder*, int32_t executionPreference) {
+    CHECK(mDevice != nullptr);
     VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
     const int n =
             compile(*mDevice, *mModel, executionPreference, *mCacheDir, &mToken, &mPreparedModel);
@@ -528,20 +533,19 @@
     return n;
 }
 
-int ExecutionPlan::finish(const ModelBuilder* fromModel, int32_t executionPreference) {
-    nnAssert(mBody != nullptr);
-    return mBody->finish(fromModel, executionPreference);
+int ExecutionPlan::finish(const ModelBuilder* mainModel, int32_t executionPreference) {
+    CHECK(mBody != nullptr);
+    return mBody->finish(mainModel, executionPreference);
 }
 
-ExecutionPlan::Controller::Controller(
-        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
-        const BurstBuilder* burstBuilder,
-        std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
-        uint32_t totalSizeOfTemporaries)
+ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
+                                      const BurstBuilder* burstBuilder,
+                                      std::map<uint32_t, uint32_t> sourceOperandToOffsetOfTemporary,
+                                      uint32_t totalSizeOfTemporaries)
     : mPlan(plan),
       mExecutionBuilder(executionBuilder),
       mBurstBuilder(burstBuilder),
-      mSubModelInputsAndOutputs(subModelInputsAndOutputs),
+      mSourceOperandToOffsetOfTemporary(std::move(sourceOperandToOffsetOfTemporary)),
       mNextStepIndex(0),
       mLastStepIndex(kBadStepIndex) {
     if (totalSizeOfTemporaries) {
@@ -567,7 +571,7 @@
             std::vector<std::shared_ptr<ExecutionBurstController>> bursts;
             bursts.reserve(compound()->mSteps.size());
             for (const auto& step : compound()->mSteps) {
-                if (const auto preparedModel = step->getPreparedSubModel()) {
+                if (const auto preparedModel = step->getPreparedStepModel()) {
                     const bool preferPowerOverLatency =
                             (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
                     bursts.push_back(
@@ -599,10 +603,9 @@
 
 std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
         ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
-    nnAssert(isValid());
-
+    CHECK(isValid());
     // Create the layout for a Memory object big enough for to hold
-    // every TEMPORARY in the original model that is live across
+    // every TEMPORARY in the source model that is live across
     // partition boundaries.
     //
     // TODO: Rethink this approach for managing temporaries.  Some
@@ -623,33 +626,29 @@
     // what our Memory objects represent.
     //
     uint32_t totalSizeOfTemporaries = 0;
-    std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
+    std::map<uint32_t, uint32_t> sourceOperandToOffsetOfTemporary;
     if (mState == COMPOUND) {
-        const ModelBuilder* fromModel = executionBuilder->getModel();
+        const ModelBuilder* mainModel = executionBuilder->getModel();
         for (const auto& step : compound()->mSteps) {
-            for (const auto& output : step->getTempsAsSubModelOutputs()) {
-                const uint32_t fromModelOperandIndex = output.first;
-                const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
-                if (subModelInputsAndOutputs == nullptr) {
-                    subModelInputsAndOutputs =
-                            std::make_shared<Controller::SubModelInputsAndOutputsType>();
-                }
-                const uint32_t size = TypeManager::get()->getSizeOfData(fromModelOperand);
+            for (const auto& output : step->getTempsAsStepModelOutputs()) {
+                const uint32_t mainModelOperandIndex = output.first;
+                const Operand& mainModelOperand = mainModel->getOperand(mainModelOperandIndex);
+                const uint32_t size = TypeManager::get()->getSizeOfData(mainModelOperand);
                 totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
-                subModelInputsAndOutputs->insert(
-                        std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
+                sourceOperandToOffsetOfTemporary.emplace(mainModelOperandIndex,
+                                                         totalSizeOfTemporaries);
                 totalSizeOfTemporaries += size;
             }
         }
-        if (VLOG_IS_ON(EXECUTION) && (subModelInputsAndOutputs != nullptr)) {
-            for (const auto& io : *subModelInputsAndOutputs) {
-                VLOG(EXECUTION) << "temp: origOpndIdx = " << io.first << ", offset = " << io.second;
+        if (VLOG_IS_ON(EXECUTION)) {
+            for (const auto& io : sourceOperandToOffsetOfTemporary) {
+                VLOG(EXECUTION) << "temp: source operand index = " << io.first
+                                << ", offset = " << io.second;
             }
         }
     }
-
     return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder,
-                                                      subModelInputsAndOutputs,
+                                                      std::move(sourceOperandToOffsetOfTemporary),
                                                       totalSizeOfTemporaries));
 }
 
@@ -725,73 +724,17 @@
         return ANEURALNETWORKS_NO_ERROR;
     }
 
-    // Input order: model inputs, temps as submodel inputs, outputs as submodel inputs
-    // Output order: model outputs, temps as submodel outputs
-    //
-    // ExecutionStep::finishSubModel() establishes these orderings.
-
     const auto step = compoundBody->mSteps[controller->mNextStepIndex];
-    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getSubModel(),
-                                               step->getDevice(), step->getPreparedSubModel());
+    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
+                                               step->getDevice(), step->getPreparedStepModel());
     (*executor)->setExecutionStep(step);
-    step->mapInputsAndOutputs(*executor);
+    step->mapInputsAndOutputs(*executor, controller->mTemporaries.get(),
+                              controller->mSourceOperandToOffsetOfTemporary,
+                              compoundBody->mSourceOperandToInputIndex,
+                              compoundBody->mSourceOperandToOutputIndex);
     if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
         *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
     }
-    if (controller->mSubModelInputsAndOutputs != nullptr) {
-        {
-            // Tell executor about temps as submodel outputs.
-
-            const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
-            const auto& subModelOutputs = step->getTempsAsSubModelOutputs();
-
-            uint32_t idx = 0;
-            for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
-                const uint32_t fromModelOperandIndex = I->first;
-                const uint32_t offsetOfTemporary =
-                        controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
-                int n = (*executor)->setOutputFromTemporaryMemory(firstSubModelOutputIndex + idx,
-                                                                  controller->mTemporaries.get(),
-                                                                  offsetOfTemporary);
-                if (n != ANEURALNETWORKS_NO_ERROR) {
-                    controller->mNextStepIndex = Controller::kBadStepIndex;
-                    return n;
-                }
-            }
-        }
-        {
-            // Tell executor about temps as submodel inputs.
-
-            const size_t firstSubModelInputIndex = step->getModelInputs().size();
-            const auto& subModelInputs = step->getTempsAsSubModelInputs();
-
-            uint32_t idx = 0;
-            for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
-                const uint32_t fromModelOperandIndex = I->first;
-                const uint32_t offsetOfTemporary =
-                        controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
-                int n = (*executor)->setInputFromTemporaryMemory(firstSubModelInputIndex + idx,
-                                                                 controller->mTemporaries.get(),
-                                                                 offsetOfTemporary);
-                if (n != ANEURALNETWORKS_NO_ERROR) {
-                    controller->mNextStepIndex = Controller::kBadStepIndex;
-                    return n;
-                }
-            }
-        }
-    }
-    {
-        // Tell executor about outputs as submodel inputs.
-
-        const size_t firstOutputsAsSubModelInputIndex =
-                step->getModelInputs().size() + step->getTempsAsSubModelInputs().size();
-        const auto& outputsAsSubModelInputsIndexToFromModel =
-                step->getOutputsAsSubModelInputsIndexToFromModel();
-        for (uint32_t i = 0, e = outputsAsSubModelInputsIndexToFromModel.size(); i < e; i++) {
-            uint32_t o = outputsAsSubModelInputsIndexToFromModel[i];
-            (*executor)->mapOutputToInput(o, firstOutputsAsSubModelInputIndex + i);
-        }
-    }
 
     controller->mNextStepIndex++;
     return ANEURALNETWORKS_NO_ERROR;
@@ -816,6 +759,12 @@
     mState = SIMPLE;
 }
 
+void ExecutionPlan::recordTemporaryDef(uint32_t sourceOperandIndex, uint32_t stepIndex) {
+    auto [it, isNew] = compound()->mTemporaryToDefiningStep.emplace(sourceOperandIndex, stepIndex);
+    CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
+                 << toString(sourceOperandIndex) << " already defined by step " << it->second;
+}
+
 void ExecutionPlan::dump() const {
     if (mBody) {
         mBody->dump();
@@ -860,8 +809,8 @@
     return compound()->mSteps;
 }
 
-bool ExecutionPlan::forTest_hasSubModelOutputsOfUnknownSize() const {
-    return mBody->hasSubModelOutputsOfUnknownSize();
+bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const {
+    return mBody->hasStepModelOutputsOfUnknownSize();
 }
 
 const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
@@ -959,7 +908,7 @@
 
     int n = plan->finish(this, preference);
     if (VLOG_IS_ON(COMPILATION)) {
-        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
+        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
         logModelToInfo(makeHidlModel());
         plan->dump();
     }
diff --git a/nn/runtime/ExecutionPlan.h b/nn/runtime/ExecutionPlan.h
index 43c3fa8..e64336b 100644
--- a/nn/runtime/ExecutionPlan.h
+++ b/nn/runtime/ExecutionPlan.h
@@ -51,53 +51,68 @@
 class PreparedModel;
 class StepExecutor;
 
+// NNAPI Control Flow will introduce the ability to refer to an NNAPI model
+// inside another NNAPI model using OperandType::MODEL. For example, a model
+// with an IF condition will refer to two other models corresponding to then
+// and else branches.
+//
+// The following terms are used:
+// - The main model is the top-level model being compiled (not referenced by any
+//   OperandType::MODEL operand within the compilation).
+// - A referenced model is a non-top-level model being compiled (referenced by
+//   at least one OperandType::MODEL operand within the set of models being
+//   compiled).
+// - A source model is either the main model or a referenced model.
+// - A step model is a model excerpted from a source model during the
+//   partitioning process.
+
 class ExecutionStep {
    public:
     typedef std::vector<std::pair<uint32_t, uint32_t>> RemapVectorType;
-    typedef std::set<std::pair<uint32_t, uint32_t>> SubModelOutputSetType;
+    typedef std::set<std::pair<uint32_t, uint32_t>> StepModelOutputSetType;
 
     enum OperandKind { INPUT, OUTPUT };
 
     ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, std::shared_ptr<Device> device);
-    int addOperation(int operationIndex, const ModelBuilder& fromModel);
-    int addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
-                   const ModelBuilder& fromModel, OperandKind kind);
 
-    // Each container entry is of the form (fromModel index, subModel index)
+    // For a given ExecutionStep, the sourceModel passed to every method must be
+    // the same.
+    int addOperation(int operationIndex, const ModelBuilder& sourceModel);
+    int addOperand(uint32_t sourceOperandIndex, uint32_t* toOperandIndex,
+                   const ModelBuilder& sourceModel, OperandKind kind);
+
+    // Each container entry is of the form (source model operand index, step model operand index)
     const RemapVectorType& getModelInputs() const { return mModelInputs; }
     const RemapVectorType& getModelOutputs() const { return mModelOutputs; }
-    const RemapVectorType& getTempsAsSubModelInputs() const { return mTempsAsSubModelInputs; }
-    const SubModelOutputSetType& getTempsAsSubModelOutputs() const {
-        return mTempsAsSubModelOutputs;
+    const RemapVectorType& getTempsAsStepModelInputs() const { return mTempsAsStepModelInputs; }
+    const StepModelOutputSetType& getTempsAsStepModelOutputs() const {
+        return mTempsAsStepModelOutputs;
     }
-    const RemapVectorType& getOutputsAsSubModelInputs() const { return mOutputsAsSubModelInputs; }
-    const std::vector<uint32_t>& getOutputIndexSubModelToFromModel() const {
-        return mOutputIndexSubModelToFromModel;
-    }
-    const std::vector<uint32_t>& getOutputsAsSubModelInputsIndexToFromModel() const {
-        return mOutputsAsSubModelInputsIndexToFromModel;
+    const RemapVectorType& getOutputsAsStepModelInputs() const { return mOutputsAsStepModelInputs; }
+    const std::vector<uint32_t>& getOutputIndexStepModelToMainModel() const {
+        return mOutputIndexStepModelToMainModel;
     }
 
-    void recordTempAsSubModelOutput(uint32_t fromModelIndex) {
-        const auto it = mOperandMap.find(fromModelIndex);
-        nnAssert(it != mOperandMap.end());
-        mTempsAsSubModelOutputs.insert(std::make_pair(fromModelIndex, it->second));
-    }
+    void recordTempAsStepModelOutput(uint32_t sourceOperandIndex);
 
-    // If this step has a submodel output of unknown size, sets
+    // If this step has a step model output of unknown size, sets
     // *hasOutputOfUnknownSize to true; otherwise, leaves it
     // unchanged.
-    int finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize,
-                       int32_t executionPreference);
+    int finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
+                        int32_t executionPreference);
 
-    const ModelBuilder* getSubModel() const { return &mSubModel; }
+    const ModelBuilder* getStepModel() const { return &mStepModel; }
     std::shared_ptr<Device> getDevice() const { return mDevice; }
 
-    // only available after calling finishSubModel()
-    std::shared_ptr<PreparedModel> getPreparedSubModel() const { return mPreparedSubModel; }
+    // only available after calling finishStepModel()
+    std::shared_ptr<PreparedModel> getPreparedStepModel() const { return mPreparedStepModel; }
 
     // Map inputs and outputs from ExecutionBuilder to StepExecutor.
-    void mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const;
+    void mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor,
+                             const Memory* temporaryMemory,
+                             const std::map<uint32_t, uint32_t>& sourceOperandToOffsetOfTemporary,
+                             const std::map<uint32_t, uint32_t>& sourceOperandToInputIndex,
+                             const std::map<uint32_t, uint32_t>& sourceOperandToOutputIndex) const;
 
     void dump() const;
 
@@ -105,7 +120,7 @@
     const uint8_t* forTest_getCacheToken() const { return mToken.getCacheToken(); }
 
    private:
-    void logSubModel() const;
+    void logStepModel() const;
 
     // TODO: Some of the data is working state information that
     // shouldn't be needed after we've constructed but not executed
@@ -113,50 +128,53 @@
 
     ExecutionPlan* mPlan;
     uint32_t mIndex;  // index of step within plan
-    ModelBuilder mSubModel;
+    ModelBuilder mStepModel;
     std::shared_ptr<Device> mDevice;
-    std::shared_ptr<PreparedModel> mPreparedSubModel;
+    std::shared_ptr<PreparedModel> mPreparedStepModel;
 
-    // Inputs of original model that are also inputs of this submodel:
-    //     (fromModel index, subModel index)
+    // All inputs of this step model:
+    //     (source model operand index, step model operand index)
+    //
+    // Depending on whether the source operand is a main model input, a
+    // temporary, or a main model output, the memory should be mapped using
+    // ExecutionPlan::CompoundBody::mSourceOperandToInputIndex,
+    // ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary, or
+    // ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex, respectively.
+    RemapVectorType mStepModelInputs;
+    // All outputs of this step model:
+    //     (source model operand index, step model operand index)
+    //
+    // Depending on whether the source operand is an output of the main model,
+    // the memory should be mapped using
+    // ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex or
+    // ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary.
+    //
+    // mOutputIndexStepModelToMainModel relies on mModelOutputs being a prefix of
+    // mStepModelOutputs.
+    RemapVectorType mStepModelOutputs;
+    // Inputs of main model that are also inputs of this step model:
+    //     (main model operand index, step model operand index)
     RemapVectorType mModelInputs;
-    // Outputs of original model that are also outputs of this submodel:
-    //     (fromModel index, subModel index)
+    // Outputs of main model that are also outputs of this step model:
+    //     (main model operand index, step model operand index)
     RemapVectorType mModelOutputs;
-    // Temporaries of original model that are inputs of this submodel:
-    //     (fromModel index, subModel index)
-    RemapVectorType mTempsAsSubModelInputs;
-    // Temporaries of original model that are outputs of this submodel:
-    //     (fromModel index, subModel index)
-    SubModelOutputSetType mTempsAsSubModelOutputs;
-    // Outputs of original model that are inputs of this submodel:
-    //     (fromModel index, subModel index)
-    RemapVectorType mOutputsAsSubModelInputs;
-    // Converts operand indexes from the main model to the submodel.
+    // Temporaries of source model that are inputs of this step model:
+    //     (source model operand index, step model operand index)
+    RemapVectorType mTempsAsStepModelInputs;
+    // Temporaries of source model that are outputs of this step model:
+    //     (source model operand index, step model operand index)
+    StepModelOutputSetType mTempsAsStepModelOutputs;
+    // Outputs of main model that are inputs of this step model:
+    //     (main model operand index, step model operand index)
+    RemapVectorType mOutputsAsStepModelInputs;
+    // Converts operand indexes from the source model to the step model.
     std::unordered_map<uint32_t, uint32_t> mOperandMap;
-    // Converts input indexes from the submodel to the main model
-    // (these are input indexes, not operand indexes).  This vector
-    // only describes inputs of the submodel that are also inputs of
-    // the main model -- that is, mModelInputs but not mTempsAsSubModelInputs.
-    std::vector<uint32_t> mInputIndexSubModelToFromModel;
-    // Converts output indexes from the submodel to the main model
+    // Converts output indexes from the step model to the main model
     // (these are output indexes, not operand indexes).  This vector
-    // only describes outputs of the submodel that are also outputs of
-    // the main model -- that is, mModelOutputs but not mTempsAsSubModelOutputs.
-    std::vector<uint32_t> mOutputIndexSubModelToFromModel;
-    // Converts indexes into mOutputsAsSubModelInputs to indexes into
-    // main model outputs (these are input and output indexes, not
-    // operand indexes).  To be specific, if the main model outputs
-    // are mainModelOutputs,
-    //
-    //     mOutputsAsSubModelInputsIndexToFromModel.size() ==
-    //     mOutputsAsSubModelInputs.size()
-    //
-    // and when (0 <= i < mOutputsAsSubModelInputs.size()),
-    //
-    //     mainModelOutputs[mOutputsAsSubModelInputsIndexToFromModel[i]] ==
-    //     mOutputsAsSubModelInputs[i].first
-    std::vector<uint32_t> mOutputsAsSubModelInputsIndexToFromModel;
+    // only describes outputs of the step model that are also outputs of
+    // the main model -- that is, mModelOutputs but not
+    // mTempsAsStepModelOutputs.
+    std::vector<uint32_t> mOutputIndexStepModelToMainModel;
 
     // The compilation caching token.
     TokenHasher mToken;
@@ -188,23 +206,19 @@
         Controller(const Controller&) = delete;
         Controller& operator=(const Controller&) = delete;
 
-        // Map from the operand index of a TEMPORARY in the original
-        // model to an offset into mTemporaries used to represent that
-        // TEMPORARY as an inter-partition input or output.
-        typedef std::map<uint32_t, uint32_t> SubModelInputsAndOutputsType;
-
         static const size_t kBadStepIndex = ~size_t(0);
 
         Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
                    const BurstBuilder* burstBuilder,
-                   std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
+                   std::map<uint32_t, uint32_t> sourceOperandToOffsetOfTemporary,
                    uint32_t totalSizeOfTemporaries);
 
         const ExecutionPlan* mPlan;
         ExecutionBuilder* mExecutionBuilder;
         const BurstBuilder* mBurstBuilder;
-        std::shared_ptr<const SubModelInputsAndOutputsType>
-                mSubModelInputsAndOutputs;  // may be nullptr
+        // Map from source operand index to an offset into mTemporaries used
+        // to represent that operand as an inter-partition input or output.
+        const std::map<uint32_t, uint32_t> mSourceOperandToOffsetOfTemporary;
         std::unique_ptr<MemoryAshmem> mTemporaries;
         size_t mNextStepIndex;
         size_t mLastStepIndex;  // For fallback.
@@ -226,13 +240,9 @@
 
     void becomeSingleStep(const std::shared_ptr<Device> device, const ModelBuilder* model);
 
-    int finish(const ModelBuilder* fromModel, int32_t executionPreference);
+    int finish(const ModelBuilder* mainModel, int32_t executionPreference);
 
-    void recordTemporaryDef(uint32_t fromModelIndex, uint32_t stepIndex) {
-        auto& temporaryToDefiningStep = compound()->mTemporaryToDefiningStep;
-        nnAssert(temporaryToDefiningStep.count(fromModelIndex) == 0);
-        temporaryToDefiningStep.insert(std::make_pair(fromModelIndex, stepIndex));
-    }
+    void recordTemporaryDef(uint32_t sourceOperandIndex, uint32_t stepIndex);
 
     void dump() const;
 
@@ -260,17 +270,17 @@
     Kind forTest_getKind() const;
     std::shared_ptr<const Device> forTest_simpleGetDevice() const;
     const std::vector<std::shared_ptr<ExecutionStep>>& forTest_compoundGetSteps() const;
-    bool forTest_hasSubModelOutputsOfUnknownSize() const;
+    bool forTest_hasStepModelOutputsOfUnknownSize() const;
     const uint8_t* forTest_simpleGetCacheToken() const;
 
    private:
-    void findTempsAsSubModelOutputs();
+    void findTempsAsStepModelOutputs();
 
     struct Body {
         virtual ~Body() {}
         virtual void dump() const = 0;
-        virtual int finish(const ModelBuilder* fromModel, int32_t executionPreference) = 0;
-        virtual bool hasSubModelOutputsOfUnknownSize() const = 0;
+        virtual int finish(const ModelBuilder* mainModel, int32_t executionPreference) = 0;
+        virtual bool hasStepModelOutputsOfUnknownSize() const = 0;
         bool mSuccessfulFinish = false;
     };
 
@@ -280,8 +290,8 @@
             : mDevice(device), mModel(model), mCacheDir(cacheDir), mToken(token) {}
 
         void dump() const override;
-        int finish(const ModelBuilder* fromModel, int32_t executionPreference) override;
-        virtual bool hasSubModelOutputsOfUnknownSize() const override { return false; }
+        int finish(const ModelBuilder* mainModel, int32_t executionPreference) override;
+        virtual bool hasStepModelOutputsOfUnknownSize() const override { return false; }
 
         std::shared_ptr<Device> mDevice;
         const ModelBuilder* mModel;
@@ -293,9 +303,9 @@
 
     struct CompoundBody : Body {
         void dump() const override;
-        int finish(const ModelBuilder* fromModel, int32_t executionPreference) override;
-        virtual bool hasSubModelOutputsOfUnknownSize() const override {
-            return mHasSubModelOutputOfUnknownSize;
+        int finish(const ModelBuilder* mainModel, int32_t executionPreference) override;
+        virtual bool hasStepModelOutputsOfUnknownSize() const override {
+            return mHasStepModelOutputOfUnknownSize;
         }
 
         // TODO: Some of the data is working state information that
@@ -304,14 +314,22 @@
 
         std::vector<std::shared_ptr<ExecutionStep>> mSteps;
 
-        // Map from original operand index to defining step index.
+        // Map from source operand index to defining step index.
         // Used for all (and only) TEMPORARY_VARIABLEs.
         std::unordered_map<uint32_t, uint32_t> mTemporaryToDefiningStep;
 
-        bool mHasSubModelOutputOfUnknownSize = false;
+        // Map from source operand index to input index of the main model.
+        // Used for all (and only) MODEL_INPUTs of the main model.
+        std::map<uint32_t, uint32_t> mSourceOperandToInputIndex;
+
+        // Map from source operand index to output index of the main model.
+        // Used for all (and only) MODEL_OUTPUTs of the main model.
+        std::map<uint32_t, uint32_t> mSourceOperandToOutputIndex;
+
+        bool mHasStepModelOutputOfUnknownSize = false;
 
        private:
-        void findTempsAsSubModelOutputs();
+        void findTempsAsStepModelOutputs();
     };
 
     enum { EMPTY, SIMPLE, COMPOUND } mState = EMPTY;
diff --git a/nn/runtime/VersionedInterfaces.cpp b/nn/runtime/VersionedInterfaces.cpp
index 310f0f9..6260808 100644
--- a/nn/runtime/VersionedInterfaces.cpp
+++ b/nn/runtime/VersionedInterfaces.cpp
@@ -801,14 +801,14 @@
 
     auto remappedResult = [&model](const std::pair<ErrorStatus, hidl_vec<bool>>& result,
                                    const std::function<uint32_t(uint32_t)>&
-                                           submodelOperationIndexToModelOperationIndex) {
+                                           slicedModelOperationIndexToModelOperationIndex) {
         const ErrorStatus status = result.first;
         const hidl_vec<bool>& supported = result.second;
         hidl_vec<bool> remappedSupported(model.operations.size());
         std::fill(remappedSupported.begin(), remappedSupported.end(), false);
         for (size_t i = 0; i < supported.size(); ++i) {
             if (supported[i]) {
-                remappedSupported[submodelOperationIndexToModelOperationIndex(i)] = true;
+                remappedSupported[slicedModelOperationIndexToModelOperationIndex(i)] = true;
             }
         }
         return std::make_pair(status, std::move(remappedSupported));
@@ -835,7 +835,7 @@
     if (getDevice<V1_2::IDevice>() != nullptr) {
         const bool compliant = compliantWithV1_2(model);
         V1_2::Model model12;
-        std::function<uint32_t(uint32_t)> submodelOperationIndexToModelOperationIndex;
+        std::function<uint32_t(uint32_t)> slicedModelOperationIndexToModelOperationIndex;
         if (compliant) {
             model12 = convertToV1_2(model);
         } else {
@@ -843,7 +843,7 @@
             if (!slice12.has_value()) {
                 return noneSupported();
             }
-            std::tie(model12, submodelOperationIndexToModelOperationIndex) = *slice12;
+            std::tie(model12, slicedModelOperationIndexToModelOperationIndex) = *slice12;
         }
         NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "getSupportedOperations_1_2");
         Return<void> ret = recoverable<void, V1_2::IDevice>(
@@ -858,7 +858,7 @@
             return kFailure;
         }
         if (!compliant) {
-            return remappedResult(result, submodelOperationIndexToModelOperationIndex);
+            return remappedResult(result, slicedModelOperationIndexToModelOperationIndex);
         }
         return result;
     }
@@ -867,7 +867,7 @@
     if (getDevice<V1_1::IDevice>() != nullptr) {
         const bool compliant = compliantWithV1_1(model);
         V1_1::Model model11;
-        std::function<uint32_t(uint32_t)> submodelOperationIndexToModelOperationIndex;
+        std::function<uint32_t(uint32_t)> slicedModelOperationIndexToModelOperationIndex;
         if (compliant) {
             model11 = convertToV1_1(model);
         } else {
@@ -875,7 +875,7 @@
             if (!slice11.has_value()) {
                 return noneSupported();
             }
-            std::tie(model11, submodelOperationIndexToModelOperationIndex) = *slice11;
+            std::tie(model11, slicedModelOperationIndexToModelOperationIndex) = *slice11;
         }
         NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "getSupportedOperations_1_1");
         Return<void> ret = recoverable<void, V1_1::IDevice>(
@@ -890,7 +890,7 @@
             return kFailure;
         }
         if (!compliant) {
-            return remappedResult(result, submodelOperationIndexToModelOperationIndex);
+            return remappedResult(result, slicedModelOperationIndexToModelOperationIndex);
         }
         return result;
     }
@@ -899,7 +899,7 @@
     if (getDevice<V1_0::IDevice>() != nullptr) {
         const bool compliant = compliantWithV1_0(model);
         V1_0::Model model10;
-        std::function<uint32_t(uint32_t)> submodelOperationIndexToModelOperationIndex;
+        std::function<uint32_t(uint32_t)> slicedModelOperationIndexToModelOperationIndex;
         if (compliant) {
             model10 = convertToV1_0(model);
         } else {
@@ -907,7 +907,7 @@
             if (!slice10.has_value()) {
                 return noneSupported();
             }
-            std::tie(model10, submodelOperationIndexToModelOperationIndex) = *slice10;
+            std::tie(model10, slicedModelOperationIndexToModelOperationIndex) = *slice10;
         }
         NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "getSupportedOperations");
         Return<void> ret = recoverable<void, V1_0::IDevice>(
@@ -922,7 +922,7 @@
             return kFailure;
         }
         if (!compliant) {
-            return remappedResult(result, submodelOperationIndexToModelOperationIndex);
+            return remappedResult(result, slicedModelOperationIndexToModelOperationIndex);
         }
         return result;
     }
diff --git a/nn/runtime/test/TestExecution.cpp b/nn/runtime/test/TestExecution.cpp
index 3f8e845..4a4bd5e 100644
--- a/nn/runtime/test/TestExecution.cpp
+++ b/nn/runtime/test/TestExecution.cpp
@@ -349,7 +349,7 @@
                                 actualCallback);
     }
 
-private:
+   private:
     ErrorStatus mErrorStatus;
 };
 
diff --git a/nn/runtime/test/TestPartitioning.cpp b/nn/runtime/test/TestPartitioning.cpp
index 5d829c3..bafdd2f 100644
--- a/nn/runtime/test/TestPartitioning.cpp
+++ b/nn/runtime/test/TestPartitioning.cpp
@@ -91,10 +91,10 @@
 // In order to determine whether or not a partitioning matches the
 // expected partitioning, we check the number of partitions, check
 // which device each partition targets, and compare each partition's
-// subgraph, model inputs, model outputs, submodel inputs, and
-// submodel outputs against what is expected.  In order to perform
+// subgraph, model inputs, model outputs, step model inputs, and
+// step model outputs against what is expected.  In order to perform
 // that comparison, we build a model to compare against a partition's
-// submodel and run a graph comparison algorithm on it.  The graph
+// step model and run a graph comparison algorithm on it.  The graph
 // comparison and the inputs and outputs comparisons are syntactic
 // rather than semantic comparisons -- they don't allow for
 // reorderings of inputs and outputs.  Because of this, we need to
@@ -108,20 +108,20 @@
 //   operands in index order (input followed by output) when that
 //   operation is added.  (It does not add an input that has already
 //   been added.)
-// - It finds model inputs, model outputs, and submodel inputs in
+// - It finds model inputs, model outputs, and step model inputs in
 //   the order the corresponding operands were added to the subgraph
 //   (see ExecutionStep methods getModelInputs(), getModelOutputs(),
-//   getTempsAsSubModelInputs(), getOutputsAsSubModelInputs()).
-// - It finds temps as submodel outputs in numerical order of corresponding
+//   getTempsAsStepModelInputs(), getOutputsAsStepModelInputs()).
+// - It finds temps as step model outputs in numerical order of corresponding
 //   operand number in the original model (see ExecutionStep method
-//   getTempsAsSubModelOutputs()).
-// - When it calls identifyInputsAndOutputs() on the submodel, it
+//   getTempsAsStepModelOutputs()).
+// - When it calls identifyInputsAndOutputs() on the step model, it
 //   passes inputs from getModelInputs() in order, followed by temps as
-//   submodel inputs from getTempsAsSubModelInputs() in order,
-//   followed by outputs as submodel inputs from
-//   getOutputsAsSubModelInputs() in order; and it passes outputs from
-//   getModelOutputs() in order followed by submodel outputs from
-//   getTempsAsSubModelOutputs() in order.
+//   step model inputs from getTempsAsStepModelInputs() in order,
+//   followed by outputs as step model inputs from
+//   getOutputsAsStepModelInputs() in order; and it passes outputs from
+//   getModelOutputs() in order followed by step model outputs from
+//   getTempsAsStepModelOutputs() in order.
 //
 // TODO: Maybe the logic for comparing a partition to an expected
 //       model should be changed to tolerate reorderings of inputs and
@@ -129,9 +129,9 @@
 //       against, we don't need to worry about input and output
 //       orderings.  But is there a way to do this that still lets us
 //       verify that we have the correct relationships between
-//       an (original) model's inputs and outputs and each submodel's
+//       an (original) model's inputs and outputs and each step model's
 //       inputs and outputs, as well as the correct relationship
-//       between submodel inputs and outputs across partitions?
+//       between step model inputs and outputs across partitions?
 
 namespace {
 
@@ -758,7 +758,7 @@
 class PartitioningTest : public ::testing::Test {
    protected:
     using RemapVectorType = ExecutionStep::RemapVectorType;
-    using SubModelOutputSetType = ExecutionStep::SubModelOutputSetType;
+    using StepModelOutputSetType = ExecutionStep::StepModelOutputSetType;
 
     virtual void SetUp() {}
 
@@ -1187,22 +1187,22 @@
     // As a side effect of the comparison, we produce a map
     // *inputsAndOutputsModelToStep that maps from each of the model input and
     // output operand numbers of "model" to the corresponding operand numbers of
-    // the submodel from "step".  If the comparison returns false, the contents
+    // the step model from "step".  If the comparison returns false, the contents
     // of the map are undefined.
     bool compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
                  std::shared_ptr<Device> device,
                  std::map<uint32_t, uint32_t>* inputsAndOutputsModelToStep) {
         return (step->getDevice() == device) &&
-               compare(step->getSubModel(),
+               compare(step->getStepModel(),
                        reinterpret_cast<const ModelBuilder*>(model->getHandle()),
                        inputsAndOutputsModelToStep);
     }
 
     void compare(std::shared_ptr<const ExecutionStep> step, const PartitioningModel* model,
                  std::shared_ptr<Device> device, const RemapVectorType& modelInputs,
-                 const RemapVectorType& modelOutputs, const RemapVectorType& tempsAsSubModelInputs,
-                 const SubModelOutputSetType& tempsAsSubModelOutputs,
-                 const RemapVectorType& outputsAsSubModelInputs) {
+                 const RemapVectorType& modelOutputs, const RemapVectorType& tempsAsStepModelInputs,
+                 const StepModelOutputSetType& tempsAsStepModelOutputs,
+                 const RemapVectorType& outputsAsStepModelInputs) {
         std::map<uint32_t, uint32_t> inputsAndOutputsModelToStep;
         ASSERT_NO_FATAL_FAILURE(
                 ASSERT_TRUE(compare(step, model, device, &inputsAndOutputsModelToStep)));
@@ -1211,13 +1211,13 @@
         ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep, step->getModelOutputs(),
                                         modelOutputs));
         ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
-                                        step->getTempsAsSubModelInputs(), tempsAsSubModelInputs));
-        ASSERT_TRUE(compareSubModelOutputSets(inputsAndOutputsModelToStep,
-                                              step->getTempsAsSubModelOutputs(),
-                                              tempsAsSubModelOutputs));
+                                        step->getTempsAsStepModelInputs(), tempsAsStepModelInputs));
+        ASSERT_TRUE(compareStepModelOutputSets(inputsAndOutputsModelToStep,
+                                               step->getTempsAsStepModelOutputs(),
+                                               tempsAsStepModelOutputs));
         ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
-                                        step->getOutputsAsSubModelInputs(),
-                                        outputsAsSubModelInputs));
+                                        step->getOutputsAsStepModelInputs(),
+                                        outputsAsStepModelInputs));
     }
 
    private:
@@ -1231,13 +1231,13 @@
         return step == model;
     }
 
-    static bool compareSubModelOutputSets(
+    static bool compareStepModelOutputSets(
             const std::map<uint32_t, uint32_t>& inputsAndOutputsModelToStep,
-            const SubModelOutputSetType& step, const SubModelOutputSetType& model) {
-        SubModelOutputSetType modelTransformed;
+            const StepModelOutputSetType& step, const StepModelOutputSetType& model) {
+        StepModelOutputSetType modelTransformed;
         std::transform(
                 model.begin(), model.end(), std::inserter(modelTransformed, modelTransformed.end()),
-                [&inputsAndOutputsModelToStep](const SubModelOutputSetType::value_type& val) {
+                [&inputsAndOutputsModelToStep](const StepModelOutputSetType::value_type& val) {
                     return std::make_pair(val.first, inputsAndOutputsModelToStep.at(val.second));
                 });
         return step == modelTransformed;
@@ -1279,7 +1279,7 @@
     // Compound partition (two devices, each is capable of one of the
     // two operations).  We could do more extensive checking here --
     // for example, verify that each step within the plan has the
-    // correct (model and submodel)x(inputs and outputs).
+    // correct (model and step model)x(inputs and outputs).
     const auto devicesB = makeDevices({{"0", 0.9, 1 << 0}, {"1", 0.5, 1 << 1}});
     ExecutionPlan planB;
     ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER, &planB),
@@ -1288,7 +1288,7 @@
     const auto& stepsB = planB.forTest_compoundGetSteps();
     ASSERT_EQ(stepsB.size(), size_t(2));
     {
-        // Build a model to compare against the submodel from stepsB[0].
+        // Build a model to compare against the step model from stepsB[0].
         PartitioningModel modelB0;
         uint32_t b0Opnd0 = modelB0.addFloatOperand();
         uint32_t b0Opnd1 = modelB0.addFloatOperand();
@@ -1301,31 +1301,31 @@
                 compare(stepsB[0], &modelB0, devicesB[0],
                         RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
                         RemapVectorType{},                                    // modelOutputs
-                        RemapVectorType{},                        // tempsAsSubModelInputs
-                        SubModelOutputSetType{{opnd2, b0Opnd2}},  // tempsAsSubModelOutputs
-                        RemapVectorType{}));                      // outputsAsSubModelInputs;
+                        RemapVectorType{},                         // tempsAsStepModelInputs
+                        StepModelOutputSetType{{opnd2, b0Opnd2}},  // tempsAsStepModelOutputs
+                        RemapVectorType{}));                       // outputsAsStepModelInputs
     }
     {
-        // Build a model to compare against the submodel from stepsB[1].
+        // Build a model to compare against the step model from stepsB[1].
         PartitioningModel modelB1;
         uint32_t b1Opnd2 = modelB1.addFloatOperand();
         uint32_t b1Opnd3 = modelB1.addFloatOperand();
         uint32_t b1Opnd4 = modelB1.addOperation2To1V1_0(1, b1Opnd2, b1Opnd3);
-        // Note: In the partitioning algorithm, submodel inputs follow
+        // Note: In the partitioning algorithm, step model inputs follow
         // model inputs.  In the original model "model", opnd2 is not
-        // an input; so in the submodel "modelB1", the corresponding
-        // input b1Opnd2 is a submodel input, and must follow the
+        // an input; so in the step model "modelB1", the corresponding
+        // input b1Opnd2 is a step model input, and must follow the
         // model input b1Opnd3.
         modelB1.identifyInputsAndOutputs({b1Opnd3, b1Opnd2}, {b1Opnd4});
         modelB1.finish();
         ASSERT_TRUE(modelB1.isValid());
 
-        ASSERT_NO_FATAL_FAILURE(compare(stepsB[1], &modelB1, devicesB[1],
-                                        RemapVectorType{{opnd3, b1Opnd3}},  // modelInputs
-                                        RemapVectorType{{opnd4, b1Opnd4}},  // modelOutputs
-                                        RemapVectorType{{opnd2, b1Opnd2}},  // tempsAsSubModelInputs
-                                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
-                                        RemapVectorType{}));      // outputsAsSubModelInputs
+        ASSERT_NO_FATAL_FAILURE(compare(
+                stepsB[1], &modelB1, devicesB[1], RemapVectorType{{opnd3, b1Opnd3}},  // modelInputs
+                RemapVectorType{{opnd4, b1Opnd4}},  // modelOutputs
+                RemapVectorType{{opnd2, b1Opnd2}},  // tempsAsStepModelInputs
+                StepModelOutputSetType{},           // tempsAsStepModelOutputs
+                RemapVectorType{}));                // outputsAsStepModelInputs
     }
 }
 
@@ -1366,7 +1366,7 @@
     const auto& stepsB = planB.forTest_compoundGetSteps();
     ASSERT_EQ(stepsB.size(), size_t(3));
     {
-        // Build a model to compare against the submodel from stepsB[0].
+        // Build a model to compare against the step model from stepsB[0].
         PartitioningModel modelB0;
         uint32_t b0Opnd0 = modelB0.addFloatOperand();
         uint32_t b0Opnd1 = modelB0.addFloatOperand();
@@ -1379,12 +1379,12 @@
                 compare(stepsB[0], &modelB0, devicesB[1],
                         RemapVectorType{{opnd0, b0Opnd0}, {opnd1, b0Opnd1}},  // modelInputs
                         RemapVectorType{{opnd4, b0Opnd2}},                    // modelOutputs
-                        RemapVectorType{},        // tempsAsSubModelInputs
-                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
-                        RemapVectorType{}));      // outputsAsSubModelInputs
+                        RemapVectorType{},         // tempsAsStepModelInputs
+                        StepModelOutputSetType{},  // tempsAsStepModelOutputs
+                        RemapVectorType{}));       // outputsAsStepModelInputs
     }
     {
-        // Build a model to compare against the submodel from stepsB[1].
+        // Build a model to compare against the step model from stepsB[1].
         PartitioningModel modelB1;
         uint32_t b1Opnd0 = modelB1.addFloatOperand();
         uint32_t b1Opnd1 = modelB1.addFloatOperand();
@@ -1398,20 +1398,20 @@
                 compare(stepsB[1], &modelB1, devicesB[0],
                         RemapVectorType{{opnd0, b1Opnd0}, {opnd1, b1Opnd1}},  // modelInputs
                         RemapVectorType{{opnd2, b1Opnd2}},                    // modelOutputs
-                        RemapVectorType{},                        // tempsAsSubModelInputs
-                        SubModelOutputSetType{{opnd3, b1Opnd3}},  // tempsAsSubModelOutputs
-                        RemapVectorType{}));                      // outputsAsSubModelInputs
+                        RemapVectorType{},                         // tempsAsStepModelInputs
+                        StepModelOutputSetType{{opnd3, b1Opnd3}},  // tempsAsStepModelOutputs
+                        RemapVectorType{}));                       // outputsAsStepModelInputs
     }
     {
-        // Build a model to compare against the submodel from stepsB[2].
+        // Build a model to compare against the step model from stepsB[2].
         PartitioningModel modelB2;
         uint32_t b2Opnd0 = modelB2.addFloatOperand();
         uint32_t b2Opnd1 = modelB2.addFloatOperand();
         uint32_t b2Opnd2 = modelB2.addOperation2To1V1_2(0, b2Opnd0, b2Opnd1);
         // Note: In the partitioning algorithm, temps that are
-        // submodel inputs precede model outputs that are submodel
+        // step model inputs precede model outputs that are step model
         // inputs.  In the original model "model", opnd3 is a temp and
-        // opnd2 is a model output; so in the submodel "modelB2", the
+        // opnd2 is a model output; so in the step model "modelB2", the
         // corresponding inputs b2Opnd1 and b2Opnd0 must appear in
         // that order.
         modelB2.identifyInputsAndOutputs({b2Opnd1, b2Opnd0}, {b2Opnd2});
@@ -1421,9 +1421,9 @@
         ASSERT_NO_FATAL_FAILURE(
                 compare(stepsB[2], &modelB2, devicesB[2], RemapVectorType{},  // modelInputs
                         RemapVectorType{{opnd5, b2Opnd2}},                    // modelOutputs
-                        RemapVectorType{{opnd3, b2Opnd1}},    // tempsAsSubModelInputs
-                        SubModelOutputSetType{},              // tempsAsSubModelOutputs
-                        RemapVectorType{{opnd2, b2Opnd0}}));  // outputsAsSubModelInputs
+                        RemapVectorType{{opnd3, b2Opnd1}},    // tempsAsStepModelInputs
+                        StepModelOutputSetType{},             // tempsAsStepModelOutputs
+                        RemapVectorType{{opnd2, b2Opnd0}}));  // outputsAsStepModelInputs
     }
 
     // TODO: Make sure this still works when we have multiple devices
@@ -1494,7 +1494,7 @@
     {
         const auto& step0 = steps[0];
 
-        // Build a model to compare against the submodel from steps[0].
+        // Build a model to compare against the step model from steps[0].
         PartitioningModel model0;
         uint32_t m0Opnd0 = model0.addFloatOperand();
         uint32_t m0Opnd1 = model0.addFloatOperand();
@@ -1508,15 +1508,15 @@
                 compare(step0, &model0, devices[0],
                         RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
                         RemapVectorType{},                                    // modelOutputs
-                        RemapVectorType{},  // tempsAsSubModelInputs
-                        SubModelOutputSetType{{opnd2, m0Opnd2},
-                                              {opnd3, m0Opnd3}},  // tempsAsSubModelOutputs
-                        RemapVectorType{}));                      // outputsAsSubModelInputs
+                        RemapVectorType{},  // tempsAsStepModelInputs
+                        StepModelOutputSetType{{opnd2, m0Opnd2},
+                                               {opnd3, m0Opnd3}},  // tempsAsStepModelOutputs
+                        RemapVectorType{}));                       // outputsAsStepModelInputs
     }
     {
         const auto& step1 = steps[1];
 
-        // Build a model to compare against the submodel from steps[1].
+        // Build a model to compare against the step model from steps[1].
         PartitioningModel model1;
         uint32_t m1Opnd0 = model1.addFloatOperand();
         uint32_t m1Opnd3 = model1.addFloatOperand();
@@ -1531,14 +1531,14 @@
                 step1, &model1, DeviceManager::getCpuDevice(),
                 RemapVectorType{{opnd0, m1Opnd0}},                    // modelInputs
                 RemapVectorType{{opnd4, m1Opnd4}},                    // modelOutputs
-                RemapVectorType{{opnd3, m1Opnd3}, {opnd2, m1Opnd2}},  // tempsAsSubModelInputs
-                SubModelOutputSetType{{opnd5, m1Opnd5}},              // tempsAsSubModelOutputs
-                RemapVectorType{}));                                  // outputsAsSubModelInputs
+                RemapVectorType{{opnd3, m1Opnd3}, {opnd2, m1Opnd2}},  // tempsAsStepModelInputs
+                StepModelOutputSetType{{opnd5, m1Opnd5}},             // tempsAsStepModelOutputs
+                RemapVectorType{}));                                  // outputsAsStepModelInputs
     }
     {
         const auto& step2 = steps[2];
 
-        // Build a model to compare against the submodel from steps[2].
+        // Build a model to compare against the step model from steps[2].
         PartitioningModel model2;
         uint32_t m2Opnd3 = model2.addFloatOperand();
         uint32_t m2Opnd5 = model2.addFloatOperand();
@@ -1552,9 +1552,9 @@
         ASSERT_NO_FATAL_FAILURE(compare(
                 step2, &model2, devices[0], RemapVectorType{{opnd6, m2Opnd6}},  // modelInputs
                 RemapVectorType{{opnd8, m2Opnd8}},                              // modelOutputs
-                RemapVectorType{{opnd3, m2Opnd3}, {opnd5, m2Opnd5}},  // tempsAsSubModelInputs
-                SubModelOutputSetType{},                              // tempsAsSubModelOutputs
-                RemapVectorType{}));                                  // outputsAsSubModelInputs
+                RemapVectorType{{opnd3, m2Opnd3}, {opnd5, m2Opnd5}},  // tempsAsStepModelInputs
+                StepModelOutputSetType{},                             // tempsAsStepModelOutputs
+                RemapVectorType{}));                                  // outputsAsStepModelInputs
     }
 }
 
@@ -1608,14 +1608,14 @@
     ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
               Result::NO_ERROR);
     ASSERT_EQ(cPWithoutFallback.finish(), Result::OP_FAILED);
-    ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize());
+    ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasStepModelOutputsOfUnknownSize());
     ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);
 }
 
 // Regression test for http://b/69166603:
-//     "partitioned compilation and execution yields wrong results when model output is submodel
+//     "partitioned compilation and execution yields wrong results when model output is step model
 //     input"
-TEST_F(PartitioningTest, ModelOutputAsSubmodelInput) {
+TEST_F(PartitioningTest, ModelOutputAsStepModelInput) {
     PartitioningModel model;
     uint32_t opnd0 = model.addFloatOperand();
     uint32_t opnd1 = model.addFloatOperand();
@@ -1628,7 +1628,7 @@
     // Compound partition (two devices, each is capable of one of the
     // two operations).  We could do more extensive checking here --
     // for example, verify that each step within the plan has the
-    // correct (model and submodel)x(inputs and outputs).
+    // correct (model and step model)x(inputs and outputs).
     const auto devices = makeDevices({{"0", 0.5, 1 << 0}, {"1", 0.5, 1 << 1}});
     ExecutionPlan plan;
     ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER, &plan),
@@ -1637,7 +1637,7 @@
     const auto& steps = plan.forTest_compoundGetSteps();
     ASSERT_EQ(steps.size(), size_t(2));
     {
-        // Build a model to compare against the submodel from steps[0].
+        // Build a model to compare against the step model from steps[0].
         PartitioningModel model0;
         uint32_t m0Opnd0 = model0.addFloatOperand();
         uint32_t m0Opnd1 = model0.addFloatOperand();
@@ -1649,12 +1649,12 @@
                 compare(steps[0], &model0, devices[0],
                         RemapVectorType{{opnd0, m0Opnd0}, {opnd1, m0Opnd1}},  // modelInputs
                         RemapVectorType{{opnd2, m0Opnd2}},                    // modelOutputs
-                        RemapVectorType{},        // tempsAsSubModelInputs
-                        SubModelOutputSetType{},  // tempsAsSubModelOutputs
-                        RemapVectorType{}));      // outputsAsSubModelInputs
+                        RemapVectorType{},         // tempsAsStepModelInputs
+                        StepModelOutputSetType{},  // tempsAsStepModelOutputs
+                        RemapVectorType{}));       // outputsAsStepModelInputs
     }
     {
-        // Build a model to compare against the submodel from steps[1].
+        // Build a model to compare against the step model from steps[1].
         PartitioningModel model1;
         uint32_t m1Opnd2 = model1.addFloatOperand();
         uint32_t m1Opnd3 = model1.addOperation2To1V1_0(1, m1Opnd2, m1Opnd2);
@@ -1665,9 +1665,9 @@
         ASSERT_NO_FATAL_FAILURE(
                 compare(steps[1], &model1, devices[1], RemapVectorType{},  // modelInputs
                         RemapVectorType{{opnd3, m1Opnd3}},                 // modelOutputs
-                        RemapVectorType{},                                 // tempsAsSubModelInputs
-                        SubModelOutputSetType{},                           // tempsAsSubModelOutputs
-                        RemapVectorType{{opnd2, m1Opnd2}}));  // outputsAsSubModelInputs
+                        RemapVectorType{},                                 // tempsAsStepModelInputs
+                        StepModelOutputSetType{},             // tempsAsStepModelOutputs
+                        RemapVectorType{{opnd2, m1Opnd2}}));  // outputsAsStepModelInputs
     }
 }
 
diff --git a/nn/runtime/test/TestPartitioningRandom.cpp b/nn/runtime/test/TestPartitioningRandom.cpp
index 8a2ecf0..7e75c38 100644
--- a/nn/runtime/test/TestPartitioningRandom.cpp
+++ b/nn/runtime/test/TestPartitioningRandom.cpp
@@ -1152,7 +1152,7 @@
               Result::NO_ERROR);
     auto compilationResult = cNoFallback.finish();
     if (hasUnknownDimensions && compilationResult == Result::OP_FAILED &&
-        cNoFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize()) {
+        cNoFallback.getExecutionPlan().forTest_hasStepModelOutputsOfUnknownSize()) {
         ASSERT_EQ(cWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
                   Result::NO_ERROR);
         ASSERT_EQ(cWithFallback.finish(), Result::NO_ERROR);
@@ -1183,7 +1183,7 @@
                 std::cout << "plan: compound, " << steps.size() << " steps over "
                           << devicesInPlan.size() << " devices" << std::endl;
                 for (unsigned i = 0; i < steps.size(); i++) {
-                    std::cout << "Step " << i << ": " << ModelStats(steps[i]->getSubModel())
+                    std::cout << "Step " << i << ": " << ModelStats(steps[i]->getStepModel())
                               << ", device = " << steps[i]->getDevice()->getName() << std::endl;
                 }
                 break;