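Delete simple reduction implementation.

With the simple reduce path removed, rename the remaining general
reduction code from "ReduceNew" to "Reduce": the launch structs, function
typedefs, launch/invoke/setup methods and logging macro. The general
reduce count is now parsed from the "exportReduceCount: " line; the
separate "exportReduceNewCount: " define is dropped.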

Bug: 27298560
Change-Id: I8c3d568e98aaf0b7d86881c985d13ed5b8e95338
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 011b8e3..8fefe88 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -45,7 +45,7 @@
 using namespace android;
 using namespace android::renderscript;
 
-#define REDUCE_NEW_ALOGV(mtls, level, ...) do { if ((mtls)->logReduce >= (level)) ALOGV(__VA_ARGS__); } while(0)
+#define REDUCE_ALOGV(mtls, level, ...) do { if ((mtls)->logReduce >= (level)) ALOGV(__VA_ARGS__); } while(0)
 
 static pthread_key_t gThreadTLSKey = 0;
 static uint32_t gThreadTLSKeyCount = 0;
@@ -354,7 +354,7 @@
 //   mtls - The MTLaunchStruct holding information about the kernel launch
 //   redp - The reduce parameters (driver info structure)
 //   x, y, z - The start offsets into each dimension
-static inline void RedpPtrSetup(const MTLaunchStructReduceNew *mtls, RsExpandKernelDriverInfo *redp,
+static inline void RedpPtrSetup(const MTLaunchStructReduce *mtls, RsExpandKernelDriverInfo *redp,
                                 uint32_t x, uint32_t y, uint32_t z) {
     for (uint32_t i = 0; i < redp->inLen; i++) {
         redp->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z);
@@ -508,8 +508,8 @@
   return *outBuf;
 }
 
-static void reduce_new_get_accumulator(uint8_t *&accumPtr, const MTLaunchStructReduceNew *mtls,
-                                       const char *walkerName, uint32_t threadIdx) {
+static void reduce_get_accumulator(uint8_t *&accumPtr, const MTLaunchStructReduce *mtls,
+                                   const char *walkerName, uint32_t threadIdx) {
   rsAssert(!accumPtr);
 
   uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);
@@ -522,8 +522,8 @@
       accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);
     }
   }
-  REDUCE_NEW_ALOGV(mtls, 2, "%s(%p): idx = %u got accumCount %u and accumPtr %p",
-                   walkerName, mtls->accumFunc, threadIdx, accumIdx, accumPtr);
+  REDUCE_ALOGV(mtls, 2, "%s(%p): idx = %u got accumCount %u and accumPtr %p",
+               walkerName, mtls->accumFunc, threadIdx, accumIdx, accumPtr);
   // initialize accumulator
   if (mtls->initFunc) {
     mtls->initFunc(accumPtr);
@@ -532,18 +532,18 @@
   }
 }
 
-static void walk_1d_reduce_new(void *usr, uint32_t idx) {
-  const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr;
+static void walk_1d_reduce(void *usr, uint32_t idx) {
+  const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
   RsExpandKernelDriverInfo redp = mtls->redp;
 
   // find accumulator
   uint8_t *&accumPtr = mtls->accumPtr[idx];
   if (!accumPtr) {
-    reduce_new_get_accumulator(accumPtr, mtls, __func__, idx);
+    reduce_get_accumulator(accumPtr, mtls, __func__, idx);
   }
 
   // accumulate
-  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
   while (1) {
     uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
     uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
@@ -566,23 +566,23 @@
     } else {
       fmt[0] = 0;
     }
-    REDUCE_NEW_ALOGV(mtls, 2, "walk_1d_reduce_new(%p): idx = %u, x in [%u, %u)%s",
-                     mtls->accumFunc, idx, xStart, xEnd, fmt);
+    REDUCE_ALOGV(mtls, 2, "walk_1d_reduce(%p): idx = %u, x in [%u, %u)%s",
+                 mtls->accumFunc, idx, xStart, xEnd, fmt);
   }
 }
 
-static void walk_2d_reduce_new(void *usr, uint32_t idx) {
-  const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr;
+static void walk_2d_reduce(void *usr, uint32_t idx) {
+  const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
   RsExpandKernelDriverInfo redp = mtls->redp;
 
   // find accumulator
   uint8_t *&accumPtr = mtls->accumPtr[idx];
   if (!accumPtr) {
-    reduce_new_get_accumulator(accumPtr, mtls, __func__, idx);
+    reduce_get_accumulator(accumPtr, mtls, __func__, idx);
   }
 
   // accumulate
-  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
   while (1) {
     uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
     uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
@@ -605,23 +605,23 @@
     } else {
       fmt[0] = 0;
     }
-    REDUCE_NEW_ALOGV(mtls, 2, "walk_2d_reduce_new(%p): idx = %u, y in [%u, %u)%s",
-                     mtls->accumFunc, idx, yStart, yEnd, fmt);
+    REDUCE_ALOGV(mtls, 2, "walk_2d_reduce(%p): idx = %u, y in [%u, %u)%s",
+                 mtls->accumFunc, idx, yStart, yEnd, fmt);
   }
 }
 
-static void walk_3d_reduce_new(void *usr, uint32_t idx) {
-  const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr;
+static void walk_3d_reduce(void *usr, uint32_t idx) {
+  const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
   RsExpandKernelDriverInfo redp = mtls->redp;
 
   // find accumulator
   uint8_t *&accumPtr = mtls->accumPtr[idx];
   if (!accumPtr) {
-    reduce_new_get_accumulator(accumPtr, mtls, __func__, idx);
+    reduce_get_accumulator(accumPtr, mtls, __func__, idx);
   }
 
   // accumulate
-  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
   while (1) {
     uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
 
@@ -640,44 +640,25 @@
     } else {
       fmt[0] = 0;
     }
-    REDUCE_NEW_ALOGV(mtls, 2, "walk_3d_reduce_new(%p): idx = %u, z = %u%s",
-                     mtls->accumFunc, idx, redp.current.z, fmt);
+    REDUCE_ALOGV(mtls, 2, "walk_3d_reduce(%p): idx = %u, z = %u%s",
+                 mtls->accumFunc, idx, redp.current.z, fmt);
   }
 }
 
-// Launch a simple reduce-style kernel.
-// Inputs:
-//  ain:  The allocation that contains the input
-//  aout: The allocation that will hold the output
-//  mtls: Holds launch parameters
-void RsdCpuReferenceImpl::launchReduce(const Allocation *ain,
-                                       Allocation *aout,
-                                       MTLaunchStructReduce *mtls) {
-    const uint32_t xStart = mtls->start.x;
-    const uint32_t xEnd = mtls->end.x;
-
-    if (xStart >= xEnd) {
-      return;
-    }
-
-    const uint32_t startOffset = ain->getType()->getElementSizeBytes() * xStart;
-    mtls->kernel(&mtls->inBuf[startOffset], mtls->outBuf, xEnd - xStart);
-}
-
 // Launch a general reduce-style kernel.
 // Inputs:
 //   ains[0..inLen-1]: Array of allocations that contain the inputs
 //   aout:             The allocation that will hold the output
 //   mtls:             Holds launch parameters
-void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
-                                          uint32_t inLen,
-                                          Allocation * aout,
-                                          MTLaunchStructReduceNew *mtls) {
+void RsdCpuReferenceImpl::launchReduce(const Allocation ** ains,
+                                       uint32_t inLen,
+                                       Allocation * aout,
+                                       MTLaunchStructReduce *mtls) {
   mtls->logReduce = mRSC->props.mLogReduce;
   if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
-    launchReduceNewParallel(ains, inLen, aout, mtls);
+    launchReduceParallel(ains, inLen, aout, mtls);
   } else {
-    launchReduceNewSerial(ains, inLen, aout, mtls);
+    launchReduceSerial(ains, inLen, aout, mtls);
   }
 }
 
@@ -686,12 +667,12 @@
 //   ains[0..inLen-1]: Array of allocations that contain the inputs
 //   aout:             The allocation that will hold the output
 //   mtls:             Holds launch parameters
-void RsdCpuReferenceImpl::launchReduceNewSerial(const Allocation ** ains,
-                                                uint32_t inLen,
-                                                Allocation * aout,
-                                                MTLaunchStructReduceNew *mtls) {
-  REDUCE_NEW_ALOGV(mtls, 1, "launchReduceNewSerial(%p): %u x %u x %u", mtls->accumFunc,
-                   mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z);
+void RsdCpuReferenceImpl::launchReduceSerial(const Allocation ** ains,
+                                             uint32_t inLen,
+                                             Allocation * aout,
+                                             MTLaunchStructReduce *mtls) {
+  REDUCE_ALOGV(mtls, 1, "launchReduceSerial(%p): %u x %u x %u", mtls->accumFunc,
+               mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z);
 
   // In the presence of outconverter, we allocate temporary memory for
   // the accumulator.
@@ -710,7 +691,7 @@
   }
 
   // accumulate
-  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
   uint32_t slice = 0;
   while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
     for (mtls->redp.current.y = mtls->start.y;
@@ -733,13 +714,13 @@
 //   ains[0..inLen-1]: Array of allocations that contain the inputs
 //   aout:             The allocation that will hold the output
 //   mtls:             Holds launch parameters
-void RsdCpuReferenceImpl::launchReduceNewParallel(const Allocation ** ains,
-                                                  uint32_t inLen,
-                                                  Allocation * aout,
-                                                  MTLaunchStructReduceNew *mtls) {
+void RsdCpuReferenceImpl::launchReduceParallel(const Allocation ** ains,
+                                               uint32_t inLen,
+                                               Allocation * aout,
+                                               MTLaunchStructReduce *mtls) {
   // For now, we don't know how to go parallel in the absence of a combiner.
   if (!mtls->combFunc) {
-    launchReduceNewSerial(ains, inLen, aout, mtls);
+    launchReduceSerial(ains, inLen, aout, mtls);
     return;
   }
 
@@ -777,19 +758,19 @@
 
   rsAssert(!mInKernel);
   mInKernel = true;
-  REDUCE_NEW_ALOGV(mtls, 1, "launchReduceNewParallel(%p): %u x %u x %u, %u threads, accumAlloc = %p",
-                   mtls->accumFunc,
-                   mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z,
-                   numThreads, mtls->accumAlloc);
+  REDUCE_ALOGV(mtls, 1, "launchReduceParallel(%p): %u x %u x %u, %u threads, accumAlloc = %p",
+               mtls->accumFunc,
+               mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z,
+               numThreads, mtls->accumAlloc);
   if (mtls->redp.dim.z > 1) {
     mtls->mSliceSize = 1;
-    launchThreads(walk_3d_reduce_new, mtls);
+    launchThreads(walk_3d_reduce, mtls);
   } else if (mtls->redp.dim.y > 1) {
     mtls->mSliceSize = rsMax(1U, mtls->redp.dim.y / (numThreads * 4));
-    launchThreads(walk_2d_reduce_new, mtls);
+    launchThreads(walk_2d_reduce, mtls);
   } else {
     mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));
-    launchThreads(walk_1d_reduce_new, mtls);
+    launchThreads(walk_1d_reduce, mtls);
   }
   mInKernel = false;
 
@@ -804,12 +785,12 @@
         if (mtls->combFunc) {
           if (mtls->logReduce >= 3) {
             FormatBuf fmt;
-            REDUCE_NEW_ALOGV(mtls, 3, "launchReduceNewParallel(%p): accumulating into%s",
-                             mtls->accumFunc,
-                             format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
-            REDUCE_NEW_ALOGV(mtls, 3, "launchReduceNewParallel(%p):    accumulator[%d]%s",
-                             mtls->accumFunc, idx,
-                             format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
+            REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): accumulating into%s",
+                         mtls->accumFunc,
+                         format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+            REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p):    accumulator[%d]%s",
+                         mtls->accumFunc, idx,
+                         format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
           }
           mtls->combFunc(finalAccumPtr, thisAccumPtr);
         } else {
@@ -823,8 +804,8 @@
   rsAssert(finalAccumPtr != nullptr);
   if (mtls->logReduce >= 3) {
     FormatBuf fmt;
-    REDUCE_NEW_ALOGV(mtls, 3, "launchReduceNewParallel(%p): final accumulator%s",
-                     mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+    REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final accumulator%s",
+                 mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
   }
 
   // Outconvert
@@ -832,9 +813,9 @@
     mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
     if (mtls->logReduce >= 3) {
       FormatBuf fmt;
-      REDUCE_NEW_ALOGV(mtls, 3, "launchReduceNewParallel(%p): final outconverted result%s",
-                       mtls->accumFunc,
-                       format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
+      REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final outconverted result%s",
+                   mtls->accumFunc,
+                   format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
     }
   }
 
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 62882aa..1515b77 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -32,22 +32,21 @@
 extern bool gArchUseSIMD;
 
 // Function types found in RenderScript code
-typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
-typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
-typedef void (*ReduceNewCombinerFunc_t)(uint8_t *accum, const uint8_t *other);
-typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
-typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
+typedef void (*ReduceAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+typedef void (*ReduceCombinerFunc_t)(uint8_t *accum, const uint8_t *other);
+typedef void (*ReduceInitializerFunc_t)(uint8_t *accum);
+typedef void (*ReduceOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
 typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
 typedef void (*InvokeFunc_t)(void *params);
 typedef void (*InitOrDtorFunc_t)(void);
 typedef int  (*RootFunc_t)(void);
 
-struct ReduceNewDescription {
-    ReduceNewAccumulatorFunc_t  accumFunc;  // expanded accumulator function
-    ReduceNewInitializerFunc_t  initFunc;   // user initializer function
-    ReduceNewCombinerFunc_t     combFunc;   // user combiner function
-    ReduceNewOutConverterFunc_t outFunc;    // user outconverter function
-    size_t                      accumSize;  // accumulator datum size, in bytes
+struct ReduceDescription {
+    ReduceAccumulatorFunc_t  accumFunc;  // expanded accumulator function
+    ReduceInitializerFunc_t  initFunc;   // user initializer function
+    ReduceCombinerFunc_t     combFunc;   // user combiner function
+    ReduceOutConverterFunc_t outFunc;    // user outconverter function
+    size_t                   accumSize;  // accumulator datum size, in bytes
 };
 
 // Internal driver callback used to execute a kernel
@@ -75,8 +74,7 @@
     RsLaunchDimensions start;
     RsLaunchDimensions end;
     // Points to MTLaunchStructForEach::fep::dim or
-    // MTLaunchStructReduce::inputDim or
-    // MTLaunchStructReduceNew::redp::dim.
+    // MTLaunchStructReduce::redp::dim.
     RsLaunchDimensions *dimPtr;
 };
 
@@ -90,22 +88,15 @@
 };
 
 struct MTLaunchStructReduce : public MTLaunchStructCommon {
-    ReduceFunc_t kernel;
-    const uint8_t *inBuf;
-    uint8_t *outBuf;
-    RsLaunchDimensions inputDim;
-};
-
-struct MTLaunchStructReduceNew : public MTLaunchStructCommon {
     // Driver info structure
     RsExpandKernelDriverInfo redp;
 
     const Allocation *ains[RS_KERNEL_INPUT_LIMIT];
 
-    ReduceNewAccumulatorFunc_t accumFunc;
-    ReduceNewInitializerFunc_t initFunc;
-    ReduceNewCombinerFunc_t combFunc;
-    ReduceNewOutConverterFunc_t outFunc;
+    ReduceAccumulatorFunc_t accumFunc;
+    ReduceInitializerFunc_t initFunc;
+    ReduceCombinerFunc_t combFunc;
+    ReduceOutConverterFunc_t outFunc;
 
     size_t accumSize;  // accumulator datum size in bytes
 
@@ -174,13 +165,9 @@
     void launchForEach(const Allocation **ains, uint32_t inLen, Allocation *aout,
                        const RsScriptCall *sc, MTLaunchStructForEach *mtls);
 
-    // Launch a simple reduce kernel
-    void launchReduce(const Allocation *ain, Allocation *aout,
-                      MTLaunchStructReduce *mtls);
-
     // Launch a general reduce kernel
-    void launchReduceNew(const Allocation ** ains, uint32_t inLen, Allocation *aout,
-                         MTLaunchStructReduceNew *mtls);
+    void launchReduce(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                      MTLaunchStructReduce *mtls);
 
     CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir,
                              uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags) override;
@@ -271,10 +258,10 @@
     long mPageSize;
 
     // Launch a general reduce kernel
-    void launchReduceNewSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout,
-                               MTLaunchStructReduceNew *mtls);
-    void launchReduceNewParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout,
-                                 MTLaunchStructReduceNew *mtls);
+    void launchReduceSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                            MTLaunchStructReduce *mtls);
+    void launchReduceParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                              MTLaunchStructReduce *mtls);
 };
 
 
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index ca9a4b6..3d5e635 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -272,7 +272,6 @@
 #define EXPORT_FUNC_STR "exportFuncCount: "
 #define EXPORT_FOREACH_STR "exportForEachCount: "
 #define EXPORT_REDUCE_STR "exportReduceCount: "
-#define EXPORT_REDUCE_NEW_STR "exportReduceNewCount: "
 #define OBJECT_SLOT_STR "objectSlotCount: "
 #define PRAGMA_STR "pragmaCount: "
 #define THREADABLE_STR "isThreadable: "
@@ -311,7 +310,6 @@
     size_t funcCount = 0;
     size_t forEachCount = 0;
     size_t reduceCount = 0;
-    size_t reduceNewCount = 0;
     size_t objectSlotCount = 0;
     size_t pragmaCount = 0;
     bool isThreadable = true;
@@ -322,8 +320,7 @@
     InvokeFunc_t* invokeFunctions = nullptr;
     ForEachFunc_t* forEachFunctions = nullptr;
     uint32_t* forEachSignatures = nullptr;
-    ReduceFunc_t* reduceFunctions = nullptr;
-    ReduceNewDescription* reduceNewDescriptions = nullptr;
+    ReduceDescription* reduceDescriptions = nullptr;
     const char ** pragmaKeys = nullptr;
     const char ** pragmaValues = nullptr;
     uint32_t checksum = 0;
@@ -455,56 +452,21 @@
         }
     }
 
-    // Read simple reduce kernels
-    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-        goto error;
-    }
-    if (sscanf(line, EXPORT_REDUCE_STR "%zu", &reduceCount) != 1) {
-        ALOGE("Invalid export reduce count!: %s", line);
-        goto error;
-    }
-
-    reduceFunctions = new ReduceFunc_t[reduceCount];
-    if (reduceFunctions == nullptr) {
-        goto error;
-    }
-
-    for (size_t i = 0; i < reduceCount; ++i) {
-        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-            goto error;
-        }
-        char *c = strrchr(line, '\n');
-        if (c) {
-            *c = '\0';
-        }
-
-        // Lookup the expanded reduce kernel.
-        strncat(line, ".expand", MAXLINESTR-strlen(line));
-
-        reduceFunctions[i] =
-            reinterpret_cast<ReduceFunc_t>(dlsym(sharedObj, line));
-        if (reduceFunctions[i] == nullptr) {
-            ALOGE("Failed to get function address for %s(): %s",
-                  line, dlerror());
-            goto error;
-        }
-    }
-
     // Read general reduce kernels
     if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
         goto error;
     }
-    if (sscanf(line, EXPORT_REDUCE_NEW_STR "%zu", &reduceNewCount) != 1) {
+    if (sscanf(line, EXPORT_REDUCE_STR "%zu", &reduceCount) != 1) {
         ALOGE("Invalid export reduce new count!: %s", line);
         goto error;
     }
 
-    reduceNewDescriptions = new ReduceNewDescription[reduceNewCount];
-    if (reduceNewDescriptions == nullptr) {
+    reduceDescriptions = new ReduceDescription[reduceCount];
+    if (reduceDescriptions == nullptr) {
         goto error;
     }
 
-    for (size_t i = 0; i < reduceNewCount; ++i) {
+    for (size_t i = 0; i < reduceCount; ++i) {
         static const char kNoName[] = ".";
 
         unsigned int tmpSig = 0;
@@ -545,25 +507,25 @@
         // The current implementation does not use the signature
         // or reduce name.
 
-        reduceNewDescriptions[i].accumSize = tmpSize;
+        reduceDescriptions[i].accumSize = tmpSize;
 
         // Process the (optional) initializer.
         if (strcmp(tmpNameInitializer, kNoName)) {
           // Lookup the original user-written initializer.
-          if (!(reduceNewDescriptions[i].initFunc =
-                (ReduceNewInitializerFunc_t) dlsym(sharedObj, tmpNameInitializer))) {
+          if (!(reduceDescriptions[i].initFunc =
+                (ReduceInitializerFunc_t) dlsym(sharedObj, tmpNameInitializer))) {
             ALOGE("Failed to find initializer function address for %s(): %s",
                   tmpNameInitializer, dlerror());
             goto error;
           }
         } else {
-          reduceNewDescriptions[i].initFunc = nullptr;
+          reduceDescriptions[i].initFunc = nullptr;
         }
 
         // Lookup the expanded accumulator.
         strncat(tmpNameAccumulator, ".expand", MAXLINESTR-strlen(tmpNameAccumulator));
-        if (!(reduceNewDescriptions[i].accumFunc =
-              (ReduceNewAccumulatorFunc_t) dlsym(sharedObj, tmpNameAccumulator))) {
+        if (!(reduceDescriptions[i].accumFunc =
+              (ReduceAccumulatorFunc_t) dlsym(sharedObj, tmpNameAccumulator))) {
             ALOGE("Failed to find accumulator function address for %s(): %s",
                   tmpNameAccumulator, dlerror());
             goto error;
@@ -572,27 +534,27 @@
         // Process the (optional) combiner.
         if (strcmp(tmpNameCombiner, kNoName)) {
           // Lookup the original user-written combiner.
-          if (!(reduceNewDescriptions[i].combFunc =
-                (ReduceNewCombinerFunc_t) dlsym(sharedObj, tmpNameCombiner))) {
+          if (!(reduceDescriptions[i].combFunc =
+                (ReduceCombinerFunc_t) dlsym(sharedObj, tmpNameCombiner))) {
             ALOGE("Failed to find combiner function address for %s(): %s",
                   tmpNameCombiner, dlerror());
             goto error;
           }
         } else {
-          reduceNewDescriptions[i].combFunc = nullptr;
+          reduceDescriptions[i].combFunc = nullptr;
         }
 
         // Process the (optional) outconverter.
         if (strcmp(tmpNameOutConverter, kNoName)) {
           // Lookup the original user-written outconverter.
-          if (!(reduceNewDescriptions[i].outFunc =
-                (ReduceNewOutConverterFunc_t) dlsym(sharedObj, tmpNameOutConverter))) {
+          if (!(reduceDescriptions[i].outFunc =
+                (ReduceOutConverterFunc_t) dlsym(sharedObj, tmpNameOutConverter))) {
             ALOGE("Failed to find outconverter function address for %s(): %s",
                   tmpNameOutConverter, dlerror());
             goto error;
           }
         } else {
-          reduceNewDescriptions[i].outFunc = nullptr;
+          reduceDescriptions[i].outFunc = nullptr;
         }
     }
 
@@ -726,8 +688,7 @@
         fieldAddress, fieldIsObject, fieldName, varCount,
         invokeFunctions, funcCount,
         forEachFunctions, forEachSignatures, forEachCount,
-        reduceFunctions, reduceCount,
-        reduceNewDescriptions, reduceNewCount,
+        reduceDescriptions, reduceCount,
         pragmaKeys, pragmaValues, pragmaCount,
         rsGlobalNames, rsGlobalAddresses, rsGlobalSizes, rsGlobalProperties,
         numEntries, isThreadable, checksum);
@@ -745,8 +706,6 @@
     delete[] pragmaKeys;
 #endif  // RS_COMPATIBILITY_LIB
 
-    delete[] reduceFunctions;
-
     delete[] forEachSignatures;
     delete[] forEachFunctions;
 
diff --git a/cpu_ref/rsCpuExecutable.h b/cpu_ref/rsCpuExecutable.h
index 72c352c..90d3759 100644
--- a/cpu_ref/rsCpuExecutable.h
+++ b/cpu_ref/rsCpuExecutable.h
@@ -67,8 +67,7 @@
                      InvokeFunc_t* invokeFunctions, size_t funcCount,
                      ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures,
                      size_t forEachCount,
-                     ReduceFunc_t* reduceFunctions, size_t reduceCount,
-                     ReduceNewDescription *reduceNewDescriptions, size_t reduceNewCount,
+                     ReduceDescription *reduceDescriptions, size_t reduceCount,
                      const char** pragmaKeys, const char** pragmaValues,
                      size_t pragmaCount,
                      const char **globalNames, const void **globalAddresses,
@@ -80,8 +79,7 @@
         mInvokeFunctions(invokeFunctions), mFuncCount(funcCount),
         mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
         mForEachCount(forEachCount),
-        mReduceFunctions(reduceFunctions), mReduceCount(reduceCount),
-        mReduceNewDescriptions(reduceNewDescriptions), mReduceNewCount(reduceNewCount),
+        mReduceDescriptions(reduceDescriptions), mReduceCount(reduceCount),
         mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
         mPragmaCount(pragmaCount), mGlobalNames(globalNames),
         mGlobalAddresses(globalAddresses), mGlobalSizes(globalSizes),
@@ -107,9 +105,7 @@
         delete[] mPragmaValues;
         delete[] mPragmaKeys;
 
-        delete[] mReduceFunctions;
-
-        delete[] mReduceNewDescriptions;
+        delete[] mReduceDescriptions;
 
         delete[] mForEachSignatures;
         delete[] mForEachFunctions;
@@ -136,7 +132,6 @@
     size_t getExportedFunctionCount() const { return mFuncCount; }
     size_t getExportedForEachCount() const { return mForEachCount; }
     size_t getExportedReduceCount() const { return mReduceCount; }
-    size_t getExportedReduceNewCount() const { return mReduceNewCount; }
     size_t getPragmaCount() const { return mPragmaCount; }
 
     void* getFieldAddress(int slot) const { return mFieldAddress[slot]; }
@@ -149,10 +144,8 @@
     ForEachFunc_t getForEachFunction(int slot) const { return mForEachFunctions[slot]; }
     uint32_t getForEachSignature(int slot) const { return mForEachSignatures[slot]; }
 
-    ReduceFunc_t getReduceFunction(int slot) const { return mReduceFunctions[slot]; }
-
-    const ReduceNewDescription* getReduceNewDescription(int slot) const {
-        return &mReduceNewDescriptions[slot];
+    const ReduceDescription* getReduceDescription(int slot) const {
+        return &mReduceDescriptions[slot];
     }
 
     const char ** getPragmaKeys() const { return mPragmaKeys; }
@@ -207,12 +200,9 @@
     uint32_t* mForEachSignatures;
     size_t mForEachCount;
 
-    ReduceFunc_t* mReduceFunctions;
+    ReduceDescription* mReduceDescriptions;
     size_t mReduceCount;
 
-    ReduceNewDescription* mReduceNewDescriptions;
-    size_t mReduceNewCount;
-
     const char ** mPragmaKeys;
     const char ** mPragmaValues;
     size_t mPragmaCount;
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 0400fab..582b342 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -500,7 +500,6 @@
     // Copy info over to runtime
     script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount();
     script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount();
-    script->mHal.info.exportedReduceNewCount = mScriptExec->getExportedReduceNewCount();
     script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount();
     script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount();
     script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();;
@@ -555,52 +554,14 @@
     return true;
 }
 
-// Preliminary work to prepare a simple reduce-style kernel for launch.
-bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation *ain,
-                                       const Allocation *aout,
+// Preliminary work to prepare a general reduce-style kernel for launch.
+bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation ** ains,
+                                       uint32_t inLen,
+                                       const Allocation * aout,
                                        const RsScriptCall *sc,
                                        MTLaunchStructReduce *mtls) {
-    rsAssert(ain && aout);
-    memset(mtls, 0, sizeof(MTLaunchStructReduce));
-    mtls->dimPtr = &mtls->inputDim;
-
-    if (allocationLODIsNull(ain) || allocationLODIsNull(aout)) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-                                     "reduce called with a null allocation");
-        return false;
-    }
-
-    // Set up the dimensions of the input.
-    const Type *inType = ain->getType();
-    mtls->inputDim.x = inType->getDimX();
-    rsAssert(inType->getDimY() == 0);
-
-    if (!setUpMtlsDimensions(mtls, mtls->inputDim, sc)) {
-        return false;
-    }
-
-    mtls->rs = mCtx;
-    // Currently not threaded.
-    mtls->isThreadable = false;
-    mtls->mSliceNum = -1;
-
-    // Set up input and output.
-    mtls->inBuf = static_cast<uint8_t *>(ain->getPointerUnchecked(0, 0));
-    mtls->outBuf = static_cast<uint8_t *>(aout->getPointerUnchecked(0, 0));
-
-    rsAssert(mtls->inBuf && mtls->outBuf);
-
-    return true;
-}
-
-// Preliminary work to prepare a general reduce-style kernel for launch.
-bool RsdCpuScriptImpl::reduceNewMtlsSetup(const Allocation ** ains,
-                                          uint32_t inLen,
-                                          const Allocation * aout,
-                                          const RsScriptCall *sc,
-                                          MTLaunchStructReduceNew *mtls) {
     rsAssert(ains && (inLen >= 1) && aout);
-    memset(mtls, 0, sizeof(MTLaunchStructReduceNew));
+    memset(mtls, 0, sizeof(MTLaunchStructReduce));
     mtls->dimPtr = &mtls->redp.dim;
 
     for (int index = inLen; --index >= 0;) {
@@ -793,29 +754,15 @@
 }
 
 void RsdCpuScriptImpl::invokeReduce(uint32_t slot,
-                                    const Allocation *ain,
+                                    const Allocation ** ains, uint32_t inLen,
                                     Allocation *aout,
                                     const RsScriptCall *sc) {
-    MTLaunchStructReduce mtls;
+  MTLaunchStructReduce mtls;
 
-    if (reduceMtlsSetup(ain, aout, sc, &mtls)) {
-        reduceKernelSetup(slot, &mtls);
-        RsdCpuScriptImpl *oldTLS = mCtx->setTLS(this);
-        mCtx->launchReduce(ain, aout, &mtls);
-        mCtx->setTLS(oldTLS);
-    }
-}
-
-void RsdCpuScriptImpl::invokeReduceNew(uint32_t slot,
-                                       const Allocation ** ains, uint32_t inLen,
-                                       Allocation *aout,
-                                       const RsScriptCall *sc) {
-  MTLaunchStructReduceNew mtls;
-
-  if (reduceNewMtlsSetup(ains, inLen, aout, sc, &mtls)) {
-    reduceNewKernelSetup(slot, &mtls);
+  if (reduceMtlsSetup(ains, inLen, aout, sc, &mtls)) {
+    reduceKernelSetup(slot, &mtls);
     RsdCpuScriptImpl *oldTLS = mCtx->setTLS(this);
-    mCtx->launchReduceNew(ains, inLen, aout, &mtls);
+    mCtx->launchReduce(ains, inLen, aout, &mtls);
     mCtx->setTLS(oldTLS);
   }
 }
@@ -829,15 +776,9 @@
 
 void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) {
     mtls->script = this;
-    mtls->kernel = mScriptExec->getReduceFunction(slot);
-    rsAssert(mtls->kernel != nullptr);
-}
-
-void RsdCpuScriptImpl::reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls) {
-    mtls->script = this;
     mtls->redp.slot = slot;
 
-    const ReduceNewDescription *desc = mScriptExec->getReduceNewDescription(slot);
+    const ReduceDescription *desc = mScriptExec->getReduceDescription(slot);
     mtls->accumFunc = desc->accumFunc;
     mtls->initFunc  = desc->initFunc;   // might legally be nullptr
     mtls->combFunc  = desc->combFunc;   // might legally be nullptr
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 2909dab..94345bd 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -61,15 +61,10 @@
                        const RsScriptCall* sc) override;
 
     void invokeReduce(uint32_t slot,
-                      const Allocation* ain,
+                      const Allocation ** ains, uint32_t inLen,
                       Allocation* aout,
                       const RsScriptCall* sc) override;
 
-    void invokeReduceNew(uint32_t slot,
-                         const Allocation ** ains, uint32_t inLen,
-                         Allocation* aout,
-                         const RsScriptCall* sc) override;
-
     void invokeInit() override;
     void invokeFreeChildren() override;
 
@@ -94,17 +89,11 @@
 
     virtual void forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls);
 
-    // Build an MTLaunchStruct suitable for launching a simple reduce-style kernel.
-    bool reduceMtlsSetup(const Allocation *ain, const Allocation *aout,
-                         const RsScriptCall *sc, MTLaunchStructReduce *mtls);
-    // Finalize an MTLaunchStruct for launching a simple reduce-style kernel.
-    virtual void reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls);
-
     // Build an MTLaunchStruct suitable for launching a general reduce-style kernel.
-    bool reduceNewMtlsSetup(const Allocation ** ains, uint32_t inLen, const Allocation *aout,
-                            const RsScriptCall *sc, MTLaunchStructReduceNew *mtls);
+    bool reduceMtlsSetup(const Allocation ** ains, uint32_t inLen, const Allocation *aout,
+                         const RsScriptCall *sc, MTLaunchStructReduce *mtls);
     // Finalize an MTLaunchStruct for launching a general reduce-style kernel.
-    virtual void reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls);
+    virtual void reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls);
 
     const RsdCpuReference::CpuSymbol * lookupSymbolMath(const char *sym);
     static void * lookupRuntimeStub(void* pContext, char const* name);
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index e226b93..a8d980e 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -59,15 +59,10 @@
                                    const RsScriptCall *sc) = 0;
 
         virtual void invokeReduce(uint32_t slot,
-                                  const Allocation *ain,
+                                  const Allocation ** ains, uint32_t inLen,
                                   Allocation *aout,
                                   const RsScriptCall *sc) = 0;
 
-        virtual void invokeReduceNew(uint32_t slot,
-                                     const Allocation ** ains, uint32_t inLen,
-                                     Allocation *aout,
-                                     const RsScriptCall *sc) = 0;
-
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;