Add histogram intrinsic to the CPU reference driver

Change-Id: I42c297bfe116ea29cf015680fcc2143ff4cc95d2
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 67e0786..4038d3a 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -27,6 +27,7 @@
 	rsCpuIntrinsicColorMatrix.cpp \
 	rsCpuIntrinsicConvolve3x3.cpp \
 	rsCpuIntrinsicConvolve5x5.cpp \
+	rsCpuIntrinsicHistogram.cpp \
 	rsCpuIntrinsicLUT.cpp \
 	rsCpuIntrinsicYuvToRGB.cpp
 
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 3e8f45b..5bfc69d 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -477,6 +477,8 @@
                                                 const Script *s, const Element *e);
 extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
                                              const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx,
+                                                 const Script *s, const Element *e);
 
 RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
                                     RsScriptIntrinsicID iid, Element *e) {
@@ -507,6 +509,9 @@
     case RS_SCRIPT_INTRINSIC_ID_BLEND:
         i = rsdIntrinsic_Blend(this, s, e);
         break;
+    case RS_SCRIPT_INTRINSIC_ID_HISTOGRAM:
+        i = rsdIntrinsic_Histogram(this, s, e);
+        break;
 
     default:
         rsAssert(0);
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 450ee30..cda40f0 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -73,6 +73,16 @@
 }
 
 
+void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                                      const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
+    // Default hook: nothing to do. invokeForEach() calls this before setting up the launch.
+}
+
+void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                                       const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
+    // Default hook: nothing to do. invokeForEach() calls this once launchThreads() has returned.
+}
+
 void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
                                           const Allocation * ain,
                                           Allocation * aout,
@@ -81,6 +91,8 @@
                                           const RsScriptCall *sc) {
 
     MTLaunchStruct mtls;
+    preLaunch(slot, ain, aout, usr, usrLen, sc);
+
     forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
     mtls.script = this;
     mtls.fep.slot = slot;
@@ -91,6 +103,8 @@
     RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
     mCtx->launchThreads(ain, aout, sc, &mtls);
     mCtx->setTLS(oldTLS);
+
+    postLaunch(slot, ain, aout, usr, usrLen, sc);
 }
 
 void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index 35ffc69..917b235 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -40,6 +40,13 @@
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
+    // Hook run by invokeForEach() before the launch is set up; the base version does nothing.
+    virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                           const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+    // Hook run by invokeForEach() after launchThreads() returns; the base version does nothing.
+    virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+
     virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
     virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
                                   const Element *e, const size_t *dims, size_t dimLength);
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
new file mode 100644
index 0000000..a1ad4e0
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU reference implementation of the histogram intrinsic.
+// Kernels count into mSums while the launch runs; postLaunch() then sums those
+// counters into the allocation held by mAllocOut.
+class RsdCpuScriptIntrinsicHistogram : public RsdCpuScriptIntrinsic {
+public:
+    RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
+    virtual ~RsdCpuScriptIntrinsicHistogram();
+
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    // Slot 0 takes the four float weights (16 bytes); slot 1 takes the output Allocation.
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+protected:
+    // Launch hooks: preLaunch() selects the kernel and clears mSums, postLaunch()
+    // writes the accumulated counts to the output allocation.
+    void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                   const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+    void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+                    const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+
+    // Kernels installed in mRootPtr by preLaunch().
+    static void kernelP1U4(const RsForEachStubParamStruct *p, uint32_t xstart, uint32_t xend,
+                           uint32_t instep, uint32_t outstep);
+    static void kernelP1L(const RsForEachStubParamStruct *p, uint32_t xstart, uint32_t xend,
+                          uint32_t instep, uint32_t outstep);
+    static void kernelP1U1(const RsForEachStubParamStruct *p, uint32_t xstart, uint32_t xend,
+                           uint32_t instep, uint32_t outstep);
+
+    float mDot[4];                        // weights used by the dot-product kernel
+    int mDotI[4];                         // mDot * 256 + 0.5, truncated to int
+    int *mSums;                           // counter storage, new[]'d in ctor, delete[]'d in dtor
+    ObjectBaseRef<Allocation> mAllocOut;  // output allocation bound through slot 1
+};
+
+}
+}
+
+void RsdCpuScriptIntrinsicHistogram::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 1);  // slot 1 is the only object slot handled here
+    mAllocOut.set(static_cast<Allocation *>(data));  // allocation that postLaunch() writes into
+}
+
+void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
+    // Slot 0 carries four floats (16 bytes): the weights read by the dot-product kernel.
+    rsAssert(slot == 0);
+    rsAssert(dataLength == 16);
+    memcpy(mDot, data, 16);
+    for (int ch = 0; ch < 4; ch++) {
+        mDotI[ch] = (int)((mDot[ch] * 256.f) + 0.5f);  // integer copy scaled by 256
+    }
+}
+
+
+
+void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
+                                      Allocation * aout, const void * usr,
+                                      uint32_t usrLen, const RsScriptCall *sc) {
+    // Called ahead of the launch: picks the kernel for this slot and zeroes the
+    // counters. mAllocOut must already be bound, since it is dereferenced here.
+    const uint32_t workerCount = mCtx->getThreadCount();
+    const uint32_t outChannels = mAllocOut->getType()->getElement()->getVectorSize();
+
+    if (slot == 0) {
+        // Per-channel histogram; the kernel depends on the input vector size.
+        const bool scalarInput = (ain->getType()->getElement()->getVectorSize() == 1);
+        mRootPtr = scalarInput ? &kernelP1U1 : &kernelP1U4;
+    } else if (slot == 1) {
+        // Weighted (dot product) histogram.
+        mRootPtr = &kernelP1L;
+    }
+    // Any other slot leaves mRootPtr as it was.
+
+    // 4 bytes per int counter (assumed), 256 counters per output channel per thread.
+    memset(mSums, 0, 256 * 4 * workerCount * outChannels);
+}
+
+void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
+                                       Allocation * aout, const void * usr,
+                                       uint32_t usrLen, const RsScriptCall *sc) {
+    // Adds the per-thread partial histograms in mSums into the bound output allocation.
+    unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
+    uint32_t threads = mCtx->getThreadCount();
+    uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
+    const uint32_t bins = 256 * vSize;  // per-thread extent, same as preLaunch() clears
+    for (uint32_t ct=0; ct < bins; ct++) {
+        o[ct] = mSums[ct];
+        for (uint32_t t=1; t < threads; t++) {
+            o[ct] += mSums[ct + t * bins];  // slice of thread t, not always slice 1
+        }
+    }
+}
+
+void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
+    // Four-channel kernel: every channel of every pixel in [xstart, xend) bumps one
+    // counter. Counters are interleaved, i.e. index = value * 4 + channel.
+    RsdCpuScriptIntrinsicHistogram *self = (RsdCpuScriptIntrinsicHistogram *)p->usr;
+    int *bins = &self->mSums[256 * 4 * p->lid];  // slice selected by p->lid
+    const uchar4 *px = (const uchar4 *)p->in;
+
+    for (uint32_t x = xstart; x < xend; x++, px++) {
+        bins[(px[0].x * 4)    ]++;
+        bins[(px[0].y * 4) + 1]++;
+        bins[(px[0].z * 4) + 2]++;
+        bins[(px[0].w * 4) + 3]++;
+    }
+}
+
+void RsdCpuScriptIntrinsicHistogram::kernelP1L(const RsForEachStubParamStruct *p,
+                                               uint32_t xstart, uint32_t xend,
+                                               uint32_t instep, uint32_t outstep) {
+    // Histogram of the weighted sum of each pixel's four channels. mDotI holds the
+    // weights scaled by 256, so ">> 8" brings the sum back to the bin range.
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
+    uchar4 *in = (uchar4 *)p->in;
+    int * sums = &cp->mSums[256 * p->lid];
+    for (uint32_t x = xstart; x < xend; x++, in++) {
+        int bin = ((cp->mDotI[0] * in[0].x) + (cp->mDotI[1] * in[0].y) +
+                   (cp->mDotI[2] * in[0].z) + (cp->mDotI[3] * in[0].w)) >> 8;
+        // Weights that are negative or add up to more than 1 would index outside
+        // this thread's 256 counters, so saturate instead of corrupting memory.
+        bin = (bin < 0) ? 0 : ((bin > 255) ? 255 : bin);
+        sums[bin] ++;
+    }
+}
+
+void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsForEachStubParamStruct *p,
+        uint32_t xstart, uint32_t xend, uint32_t instep, uint32_t outstep) {
+    // Single-channel kernel: counts each input value (assumed 8-bit — confirm) in [xstart, xend).
+    int *sums = &((RsdCpuScriptIntrinsicHistogram *)p->usr)->mSums[256 * p->lid];
+    for (uint32_t x = xstart; x < xend; x++) sums[((const uint8_t *)p->in)[x - xstart]]++;
+}
+
+RsdCpuScriptIntrinsicHistogram::RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx,
+                                                     const Script *s, const Element *e)
+            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_HISTOGRAM) {
+    // mRootPtr is chosen per launch in preLaunch(); mSums holds 256 * 4 counters per thread.
+    mRootPtr = NULL;
+    mSums = new int[256 * 4 * mCtx->getThreadCount()];
+    // Default weights (0.299, 0.587, 0.114, 0); setGlobalVar() can replace them.
+    mDot[0] = 0.299f;
+    mDot[1] = 0.587f;
+    mDot[2] = 0.114f;
+    mDot[3] = 0;
+    for (int i = 0; i < 4; i++) {
+        mDotI[i] = (int)((mDot[i] * 256.f) + 0.5f);  // integer weight scaled by 256
+    }
+}
+
+RsdCpuScriptIntrinsicHistogram::~RsdCpuScriptIntrinsicHistogram() {
+    // Release the counters allocated in the constructor. delete[] accepts a
+    // null pointer, so no guard is required.
+    delete[] mSums;
+}
+
+void RsdCpuScriptIntrinsicHistogram::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 2;  // slots 0 and 1, see setGlobalVar()/setGlobalObj()
+}
+
+void RsdCpuScriptIntrinsicHistogram::invokeFreeChildren() {
+    // Nothing is released here. NOTE(review): mAllocOut is not cleared — confirm that is intended.
+}
+
+RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
+    // Factory called from RsdCpuReferenceImpl::createIntrinsic(); returns a newly allocated object.
+    return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
+}
+
+
diff --git a/rsDefines.h b/rsDefines.h
index 6e080de..0287f67 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -363,7 +363,8 @@
     RS_SCRIPT_INTRINSIC_ID_BLUR = 5,
     RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB = 6,
     RS_SCRIPT_INTRINSIC_ID_BLEND = 7,
-    RS_SCRIPT_INTRINSIC_ID_3DLUT = 8
+    RS_SCRIPT_INTRINSIC_ID_3DLUT = 8,
+    RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9
 };
 
 typedef struct {