Add a basic implementation of the reduce kernel API to the CPU
reference implementation.

Bug: 22631253

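For now, this just runs a serial reduction on one thread.

The existing forEach launch path is renamed along the way:
MTLaunchStruct becomes MTLaunchStructForEach and launchThreads
becomes launchForEach.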

Change-Id: I34c96d24bb6f44274de72bb53160abcf79d143b0
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 82208db..9cc9b69 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -203,7 +203,7 @@
 
     }
 
-    MTLaunchStruct mtls;
+    MTLaunchStructForEach mtls;
 
     if (fieldDep) {
         for (size_t ct=0; ct < ins.size(); ct++) {
@@ -230,7 +230,7 @@
                           mtls.fep.usrLen, nullptr);
 
             if (launchOK) {
-                mCtx->launchThreads(ains, inLen, outs[ct], nullptr, &mtls);
+                mCtx->launchForEach(ains, inLen, outs[ct], nullptr, &mtls);
             }
 
             si->postLaunch(slot, ains, inLen, outs[ct], nullptr, 0, nullptr);
@@ -280,10 +280,10 @@
         if (si->forEachMtlsSetup(ains, inLen, outs[0], nullptr, 0, nullptr, &mtls)) {
 
             mtls.script = nullptr;
-            mtls.kernel = (void (*)())&scriptGroupRoot;
+            mtls.kernel = &scriptGroupRoot;
             mtls.fep.usr = &sl;
 
-            mCtx->launchThreads(ains, inLen, outs[0], nullptr, &mtls);
+            mCtx->launchForEach(ains, inLen, outs[0], nullptr, &mtls);
         }
 
         for (size_t ct=0; ct < kernels.size(); ct++) {