Enable synchronous mode (functional): run RS API calls directly on the calling thread

Change-Id: I613610013e7e4d1623620ab94d2d25d8a1bd82b3
Bug: 5972398
diff --git a/cpp/RenderScript.cpp b/cpp/RenderScript.cpp
index 4ce4c9b..98ab380 100644
--- a/cpp/RenderScript.cpp
+++ b/cpp/RenderScript.cpp
@@ -54,18 +54,18 @@
     mDev = NULL;
 }
 
-bool RS::init(bool forceCpu) {
-    return RS::init(RS_VERSION, forceCpu);
+bool RS::init(bool forceCpu, bool synchronous) {  // public entry point; both flags are forwarded as-is
+    return RS::init(RS_VERSION, forceCpu, synchronous);  // delegate to the targetApi overload using RS_VERSION
 }
 
-bool RS::init(int targetApi, bool forceCpu) {
+bool RS::init(int targetApi, bool forceCpu, bool synchronous) {  // returns false if device or context creation fails
     mDev = rsDeviceCreate();
     if (mDev == 0) {
         ALOGE("Device creation failed");
         return false;
     }
 
-    mContext = rsContextCreate(mDev, 0, targetApi, forceCpu);
+    mContext = rsContextCreate(mDev, 0, targetApi, forceCpu, synchronous);  // synchronous is handed to the context unchanged
     if (mContext == 0) {
         ALOGE("Context creation failed");
         return false;
diff --git a/cpp/rsCppStructs.h b/cpp/rsCppStructs.h
index a430c35..a381816 100644
--- a/cpp/rsCppStructs.h
+++ b/cpp/rsCppStructs.h
@@ -43,8 +43,7 @@
     RS();
     virtual ~RS();
 
-    bool init() { return init(false); }
-    bool init(bool forceCpu);
+    bool init(bool forceCpu = false, bool synchronous = false);  // defaults keep existing init() and init(forceCpu) call sites compiling
 
     void setErrorHandler(ErrorHandlerFunc_t func);
     ErrorHandlerFunc_t getErrorHandler() { return mErrorFunc; }
@@ -59,7 +58,7 @@
     void finish();
 
  private:
-    bool init(int targetApi, bool forceCpu);
+    bool init(int targetApi, bool forceCpu, bool synchronous);  // explicit-targetApi variant; the public init() passes RS_VERSION
     static void * threadProc(void *);
 
     static bool gInitialized;
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 5ea28d4..e22b730 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -132,6 +132,16 @@
 void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
     mWorkers.mLaunchData = data;
     mWorkers.mLaunchCallback = cbk;
+
+    // Fast path for very small launches: run the callback once on the calling thread and skip waking the workers.
+    MTLaunchStruct *mtls = (MTLaunchStruct *)data;  // NOTE(review): assumes every caller passes an MTLaunchStruct as data — confirm
+    if (mtls && mtls->fep.dimY <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {  // dimY <= 1 and an X range no longer than mSliceSize
+        if (mWorkers.mLaunchCallback) {
+            mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
+        }
+        return;  // mRunningCount and the launch signals below are left untouched
+    }
+
     android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
     for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
         mWorkers.mLaunchSignals[ct].set();
@@ -140,7 +150,7 @@
     // We use the calling thread as one of the workers so we can start without
     // the delay of the thread wakeup.
     if (mWorkers.mLaunchCallback) {
-       mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
+        mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);  // NOTE(review): 0 is presumably the calling thread's worker index — confirm
     }
 
     while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
diff --git a/rs.h b/rs.h
index e6d3b12..afe5534 100644
--- a/rs.h
+++ b/rs.h
@@ -52,7 +52,7 @@
 void rsDeviceDestroy(RsDevice dev);
 void rsDeviceSetConfig(RsDevice dev, RsDeviceParam p, int32_t value);
 RsContext rsContextCreate(RsDevice dev, uint32_t version, uint32_t sdkVersion);
-RsContext rsContextCreate(RsDevice dev, uint32_t version, uint32_t sdkVersion, bool forceCpu);
+RsContext rsContextCreate(RsDevice dev, uint32_t version, uint32_t sdkVersion, bool forceCpu, bool synchronous = false);  // default keeps four-argument callers compiling
 RsContext rsContextCreateGL(RsDevice dev, uint32_t version, uint32_t sdkVersion,
                             RsSurfaceConfig sc, uint32_t dpi);
 
diff --git a/rsContext.cpp b/rsContext.cpp
index 221d572..80021d5 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -250,7 +250,9 @@
     Context *rsc = static_cast<Context *>(vrsc);
 #ifndef ANDROID_RS_SERIALIZE
     rsc->mNativeThreadId = gettid();
-    setpriority(PRIO_PROCESS, rsc->mNativeThreadId, ANDROID_PRIORITY_DISPLAY);
+    if (!rsc->isSynchronous()) {  // synchronous contexts call threadProc directly on the creating thread, so its priority is left alone
+        setpriority(PRIO_PROCESS, rsc->mNativeThreadId, ANDROID_PRIORITY_DISPLAY);
+    }  // NOTE(review): mThreadPriority below is still recorded as DISPLAY when setpriority() is skipped — confirm intended
     rsc->mThreadPriority = ANDROID_PRIORITY_DISPLAY;
 #endif //ANDROID_RS_SERIALIZE
     rsc->props.mLogTimes = getProp("debug.rs.profile") != 0;
@@ -318,6 +320,11 @@
     }
 
     rsc->mRunning = true;
+
+    if (rsc->isSynchronous()) {  // setup is complete; synchronous contexts do not enter the command loops below
+        return NULL;  // NOTE(review): any teardown that follows the loops is skipped as well — confirm it happens elsewhere
+    }
+
     if (!rsc->mIsGraphicsContext) {
         while (!rsc->mExit) {
             rsc->mIO.playCoreCommands(rsc, -1);
@@ -442,17 +449,15 @@
     mIsContextLite = false;
     memset(&watchdog, 0, sizeof(watchdog));
     mForceCpu = false;
-}
-
-Context * Context::createContext(Device *dev, const RsSurfaceConfig *sc) {
-    return createContext(dev, sc, false);
+    mSynchronous = false;  // asynchronous unless createContext() overrides it
 }
 
 Context * Context::createContext(Device *dev, const RsSurfaceConfig *sc,
-                                 bool forceCpu) {
+                                 bool forceCpu, bool synchronous) {  // the removed two-argument overload is covered by the header's default arguments
     Context * rsc = new Context();
 
     rsc->mForceCpu = forceCpu;
+    rsc->mSynchronous = synchronous;  // assigned before initContext() is called below
 
     if (!rsc->initContext(dev, sc)) {
         delete rsc;
@@ -500,22 +505,31 @@
 
     timerInit();
     timerSet(RS_TIMER_INTERNAL);
+    if (mSynchronous) {
+        // Synchronous mode: run the context setup on the caller's thread
+        // instead of spawning a dedicated context thread.
+        threadProc(this);
+    } else {
+        status = pthread_create(&mThreadId, &threadAttr, threadProc, this);
+        if (status) {
+            ALOGE("Failed to start rs context thread.");
+            pthread_attr_destroy(&threadAttr);
+            return false;
+        }
+        while (!mRunning && (mError == RS_ERROR_NONE)) {
+            usleep(100);
+        }
+    }
 
-    status = pthread_create(&mThreadId, &threadAttr, threadProc, this);
-    if (status) {
-        ALOGE("Failed to start rs context thread.");
-        return false;
-    }
-    while (!mRunning && (mError == RS_ERROR_NONE)) {
-        usleep(100);
-    }
+    // The attributes are only needed by pthread_create(); release them on
+    // every path, including synchronous mode where no thread is created.
+    pthread_attr_destroy(&threadAttr);
 
     if (mError != RS_ERROR_NONE) {
         ALOGE("Errors during thread init");
         return false;
     }
 
-    pthread_attr_destroy(&threadAttr);
     return true;
 }
 
@@ -817,14 +825,15 @@
 
 RsContext rsContextCreate(RsDevice vdev, uint32_t version,
                           uint32_t sdkVersion) {
-    return rsContextCreate(vdev, version, sdkVersion, false);
+    return rsContextCreate(vdev, version, sdkVersion, false, false);  // forceCpu = false, synchronous = false
 }
 
 RsContext rsContextCreate(RsDevice vdev, uint32_t version,
-                          uint32_t sdkVersion, bool forceCpu) {
+                          uint32_t sdkVersion, bool forceCpu,
+                          bool synchronous) {  // synchronous is forwarded to Context::createContext()
     ALOGV("rsContextCreate dev=%p", vdev);
     Device * dev = static_cast<Device *>(vdev);
-    Context *rsc = Context::createContext(dev, NULL, forceCpu);
+    Context *rsc = Context::createContext(dev, NULL, forceCpu, synchronous);  // NULL: no surface config is supplied
     if (rsc) {
         rsc->setTargetSdkVersion(sdkVersion);
     }
diff --git a/rsContext.h b/rsContext.h
index 61218da..bc0c63e 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -67,8 +67,7 @@
     };
     Hal mHal;
 
-    static Context * createContext(Device *, const RsSurfaceConfig *sc);
-    static Context * createContext(Device *, const RsSurfaceConfig *sc, bool forceCpu);
+    static Context * createContext(Device *, const RsSurfaceConfig *sc, bool forceCpu = false, bool synchronous = false);  // defaults replace the two removed overloads
     static Context * createContextLite();
     ~Context();
 
@@ -104,6 +103,8 @@
     ScriptCState mScriptC;
     FBOCache mFBOCache;
 
+    bool isSynchronous() const { return mSynchronous; }  // read-only query of the flag passed to createContext()
+
     void swapBuffers();
     void setRootScript(Script *);
     void setProgramRaster(ProgramRaster *);
@@ -247,6 +248,7 @@
     Context();
     bool initContext(Device *, const RsSurfaceConfig *sc);
 
+    bool mSynchronous;  // defaults to false; assigned from createContext()'s synchronous argument
     bool initGLThread();
     void deinitEGL();
 
diff --git a/rsg_generator.c b/rsg_generator.c
index c404c9c..7022bcb 100644
--- a/rsg_generator.c
+++ b/rsg_generator.c
@@ -224,6 +224,29 @@
             }
             fprintf(f, ");\n");
         } else {
+            // Synchronous path: emit a direct call to the rsi_ implementation that returns before the ThreadIO code below.
+            fprintf(f, "    if (((Context *)rsc)->isSynchronous()) {\n");
+            fprintf(f, "        ");
+            if (api->ret.typeName[0]) {
+                fprintf(f, "return ");  // non-void API: return the rsi_ result directly
+            }
+            fprintf(f, "rsi_%s(", api->name);
+            if (!api->nocontext) {
+                fprintf(f, "(Context *)rsc");
+            }
+            for (ct2=0; ct2 < api->paramCount; ct2++) {
+                const VarType *vt = &api->params[ct2];
+                if (ct2 > 0 || !api->nocontext) {
+                    fprintf(f, ", ");  // separator needed after the context argument or a previous parameter
+                }
+                fprintf(f, "%s", vt->name);
+            }
+            fprintf(f, ");\n");
+            if (!api->ret.typeName[0]) {
+                fprintf(f, "        return;\n");  // void API: own line, indented inside the emitted if block
+            }
+            fprintf(f, "    }\n\n");
+
             fprintf(f, "    ThreadIO *io = &((Context *)rsc)->mIO;\n");
             fprintf(f, "    const uint32_t size = sizeof(RS_CMD_%s);\n", api->name);
             if (hasInlineDataPointers(api)) {
diff --git a/tests/latency/latency.cpp b/tests/latency/latency.cpp
index 124fb20..86d1a29 100644
--- a/tests/latency/latency.cpp
+++ b/tests/latency/latency.cpp
@@ -11,6 +11,7 @@
     int iters = 100;
     int numElems = 1000;
     bool forceCpu = false;
+    bool synchronous = false;  // set from argv[4]; a non-zero value requests a synchronous context
 
     if (argc >= 2) {
         iters = atoi(argv[1]);
@@ -36,14 +37,23 @@
             forceCpu = true;
     }
 
+    if (argc >= 5) {  // optional fourth argument selects synchronous mode
+        int temp = atoi(argv[4]);
+        if (temp != 0)
+            synchronous = true;
+    }
+
     if (forceCpu)
         printf("forcing CPU\n");
 
+    if (synchronous)
+        printf("forcing synchronous\n");
+
     printf("numElems = %d\n", numElems);
 
     sp<RS> rs = new RS();
 
-    bool r = rs->init(forceCpu); // force CPU execution
+    bool r = rs->init(forceCpu, synchronous);  // NOTE(review): r is not checked in the lines visible here — confirm init failure is handled
 
     sp<const Element> e = Element::U32(rs);
 
@@ -51,6 +61,8 @@
     tb.setX(numElems);
     sp<const Type> t = tb.create();
 
+    uint32_t *buf = new uint32_t[numElems]();  // zero-filled: copy1DFrom(buf) reads it before anything is written; NOTE(review): no delete[] visible here
+
     sp<Allocation> ain = Allocation::createTyped(rs, t);
     sp<Allocation> aout = Allocation::createTyped(rs, t);
 
@@ -72,6 +84,21 @@
     printf("elapsed time : %lld microseconds\n", elapsed);
     printf("time per iter: %f microseconds\n", (double)elapsed / iters);
 
+    gettimeofday(&start, NULL);  // second measurement: each iteration also copies data in and out
+
+    for (int i = 0; i < iters; i++) {
+        ain->copy1DFrom(buf);
+        sc->forEach_root(ain, aout);
+        aout->copy1DTo(buf);
+    }
+
+    rs->finish();
+
+    gettimeofday(&stop, NULL);
+    elapsed = static_cast<long long>(stop.tv_sec - start.tv_sec) * 1000000LL + (stop.tv_usec - start.tv_usec);  // subtract seconds first so the scale-up cannot overflow a 32-bit tv_sec
+    printf("elapsed time with copy : %lld microseconds\n", elapsed);
+    printf("time per iter with copy: %f microseconds\n", (double)elapsed / iters);
+
     sc.clear();
     t.clear();
     e.clear();