add histogram intrinsic
Change-Id: I42c297bfe116ea29cf015680fcc2143ff4cc95d2
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 67e0786..4038d3a 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -27,6 +27,7 @@
rsCpuIntrinsicColorMatrix.cpp \
rsCpuIntrinsicConvolve3x3.cpp \
rsCpuIntrinsicConvolve5x5.cpp \
+ rsCpuIntrinsicHistogram.cpp \
rsCpuIntrinsicLUT.cpp \
rsCpuIntrinsicYuvToRGB.cpp
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 3e8f45b..5bfc69d 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -477,6 +477,8 @@
const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx,
+ const Script *s, const Element *e);
RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
RsScriptIntrinsicID iid, Element *e) {
@@ -507,6 +509,9 @@
case RS_SCRIPT_INTRINSIC_ID_BLEND:
i = rsdIntrinsic_Blend(this, s, e);
break;
+ case RS_SCRIPT_INTRINSIC_ID_HISTOGRAM:
+ i = rsdIntrinsic_Histogram(this, s, e);
+ break;
default:
rsAssert(0);
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 450ee30..cda40f0 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -73,6 +73,16 @@
}
+// Default pre-launch hook. The body is intentionally empty: invokeForEach()
+// calls this before forEachMtlsSetup(), and intrinsics that need per-launch
+// preparation override it.
+void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc) {
+}
+
+// Default post-launch hook. The body is intentionally empty: invokeForEach()
+// calls this after launchThreads() has returned and the previous TLS script
+// has been restored; intrinsics that need to finalize results override it.
+void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc) {
+}
+
void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
const Allocation * ain,
Allocation * aout,
@@ -81,6 +91,8 @@
const RsScriptCall *sc) {
MTLaunchStruct mtls;
+ preLaunch(slot, ain, aout, usr, usrLen, sc);
+
forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
mtls.script = this;
mtls.fep.slot = slot;
@@ -91,6 +103,8 @@
RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
mCtx->launchThreads(ain, aout, sc, &mtls);
mCtx->setTLS(oldTLS);
+
+ postLaunch(slot, ain, aout, usr, usrLen, sc);
}
void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index 35ffc69..917b235 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -40,6 +40,13 @@
virtual void invokeInit();
virtual void invokeFreeChildren();
+ // Hooks invoked by invokeForEach() around the threaded launch: preLaunch()
+ // runs first, postLaunch() runs last. Both receive the same arguments that
+ // were passed to invokeForEach(). The implementations in RsdCpuScriptIntrinsic
+ // are empty.
+ virtual void preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
+ virtual void postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
+
virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
const Element *e, const size_t *dims, size_t dimLength);
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
new file mode 100644
index 0000000..a1ad4e0
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU reference implementation of the histogram intrinsic. The kernels
+// increment counters in mSums (each kernel offsets into mSums by p->lid);
+// postLaunch() then writes the accumulated counts into mAllocOut.
+class RsdCpuScriptIntrinsicHistogram : public RsdCpuScriptIntrinsic {
+public:
+ // Sets the exported variable count of the script to 2.
+ virtual void populateScript(Script *);
+ virtual void invokeFreeChildren();
+
+ // Slot 0: 16 bytes that are copied into mDot (and converted into mDotI).
+ virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+ // Slot 1: the Allocation that receives the histogram (stored in mAllocOut).
+ virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+ // The destructor releases mSums, which the constructor allocates.
+ virtual ~RsdCpuScriptIntrinsicHistogram();
+ RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
+
+protected:
+ // Selects the kernel for the requested slot and zeroes mSums.
+ void preLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
+ // Copies the counters gathered in mSums into the storage of mAllocOut.
+ void postLaunch(uint32_t slot, const Allocation * ain,
+ Allocation * aout, const void * usr,
+ uint32_t usrLen, const RsScriptCall *sc);
+
+
+ // Per-channel weights (x, y, z, w) used by kernelP1L.
+ float mDot[4];
+ // mDot scaled by 256 and rounded: (int)(mDot[i] * 256.f + 0.5f).
+ int mDotI[4];
+ // Counter storage; the constructor allocates 256 * 4 * getThreadCount() ints.
+ int *mSums;
+ // Output allocation supplied through setGlobalObj().
+ ObjectBaseRef<Allocation> mAllocOut;
+
+ // Slot 0, input vector size != 1: counts x/y/z/w of every uchar4 input
+ // element into four interleaved sets of 256 bins.
+ static void kernelP1U4(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep);
+ // Slot 1: bins the mDotI-weighted sum of each uchar4 input element.
+ static void kernelP1L(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep);
+ // Slot 0, input vector size == 1.
+ static void kernelP1U1(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep);
+
+};
+
+}
+}
+
+// Binds the output allocation. Only slot 1 is expected; the object is stored
+// in mAllocOut and later read by preLaunch()/postLaunch().
+void RsdCpuScriptIntrinsicHistogram::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 1);
+
+    Allocation *output = static_cast<Allocation *>(data);
+    mAllocOut.set(output);
+}
+
+// Receives the four float channel weights (slot 0, exactly 16 bytes) and
+// refreshes the fixed-point copy used by kernelP1L.
+void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
+    rsAssert(slot == 0);
+    rsAssert(dataLength == 16);
+
+    // mDot is float[4], so sizeof(mDot) is the same 16 bytes asserted above.
+    memcpy(mDot, data, sizeof(mDot));
+
+    // Scale each weight by 256 and round by adding 0.5 before truncation.
+    for (int ch = 0; ch < 4; ch++) {
+        mDotI[ch] = (int)((mDot[ch] * 256.f) + 0.5f);
+    }
+}
+
+
+
+// Runs before the threaded launch: picks the kernel that matches the requested
+// slot (and, for slot 0, the vector size of the input element) and clears the
+// counters the kernels are about to increment.
+//
+// slot 0 -> kernelP1U1 when the input element has vector size 1, else kernelP1U4
+// slot 1 -> kernelP1L
+// Any other slot leaves mRootPtr untouched.
+void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
+                                               Allocation * aout, const void * usr,
+                                               uint32_t usrLen, const RsScriptCall *sc) {
+
+    const uint32_t threads = mCtx->getThreadCount();
+    const uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
+
+    if (slot == 0) {
+        if (ain->getType()->getElement()->getVectorSize() == 1) {
+            mRootPtr = &kernelP1U1;
+        } else {
+            mRootPtr = &kernelP1U4;
+        }
+    } else if (slot == 1) {
+        mRootPtr = &kernelP1L;
+    }
+
+    // memset() takes a byte count: 256 bins * 4 bytes * threads * output lanes.
+    memset(mSums, 0, 256 * 4 * threads * vSize);
+}
+
+// Runs after the threaded launch: reduces the per-thread counters in mSums
+// into the output allocation. For every bin the counts of all threads are
+// summed and the total is stored at the same index in mAllocOut's storage.
+//
+// Each thread's counters are taken to occupy 256 * vSize consecutive ints,
+// where vSize is the vector size of the output element, so thread t starts at
+// offset 256 * vSize * t.
+// NOTE(review): kernelP1U4 uses a fixed stride of 256 * 4 and kernelP1L a
+// stride of 256; this matches only when the output vector size is 4 resp. 1.
+void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
+                                                Allocation * aout, const void * usr,
+                                                uint32_t usrLen, const RsScriptCall *sc) {
+
+    unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
+    const uint32_t threads = mCtx->getThreadCount();
+    const uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
+    const uint32_t binsPerThread = 256 * vSize;
+
+    for (uint32_t ct = 0; ct < binsPerThread; ct++) {
+        unsigned int total = mSums[ct];
+        for (uint32_t t = 1; t < threads; t++) {
+            // The offset must scale with t; a constant offset would re-add
+            // thread 1's counters and never read threads 2 and up.
+            total += mSums[ct + (binsPerThread * t)];
+        }
+        o[ct] = total;
+    }
+}
+
+// Four-channel kernel: for every uchar4 input element in [xstart, xend) it
+// increments one counter per channel. Counters are interleaved, so the bin
+// for value v of channel c lives at index v * 4 + c within this invocation's
+// block of 256 * 4 counters (selected by p->lid).
+void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
+
+    RsdCpuScriptIntrinsicHistogram *self = (RsdCpuScriptIntrinsicHistogram *)p->usr;
+    const uchar4 *src = (const uchar4 *)p->in;
+    int *bins = self->mSums + (256 * 4 * p->lid);
+
+    for (uint32_t x = xstart; x < xend; x++) {
+        const uchar4 px = *src++;
+        bins[(px.x * 4) + 0]++;
+        bins[(px.y * 4) + 1]++;
+        bins[(px.z * 4) + 2]++;
+        bins[(px.w * 4) + 3]++;
+    }
+}
+
+// Weighted-sum kernel: for every uchar4 input element in [xstart, xend) it
+// computes x*mDotI[0] + y*mDotI[1] + z*mDotI[2] + w*mDotI[3], drops the 8
+// fractional bits (mDotI holds the weights scaled by 256) and increments the
+// resulting bin in this invocation's block of 256 counters (selected by
+// p->lid).
+//
+// The bin index is clamped to [0, 255]. The weights come from setGlobalVar()
+// and are rounded individually, so their scaled sum can exceed 256 (or a
+// weight can be negative); without the clamp such input would index outside
+// the 256 counters that belong to this invocation.
+void RsdCpuScriptIntrinsicHistogram::kernelP1L(const RsForEachStubParamStruct *p,
+                                               uint32_t xstart, uint32_t xend,
+                                               uint32_t instep, uint32_t outstep) {
+
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
+    uchar4 *in = (uchar4 *)p->in;
+    int * sums = &cp->mSums[256 * p->lid];
+
+    for (uint32_t x = xstart; x < xend; x++) {
+        int t = (cp->mDotI[0] * in[0].x) +
+                (cp->mDotI[1] * in[0].y) +
+                (cp->mDotI[2] * in[0].z) +
+                (cp->mDotI[3] * in[0].w);
+
+        int bin = t >> 8;
+        if (bin < 0) {
+            bin = 0;
+        } else if (bin > 255) {
+            bin = 255;
+        }
+        sums[bin] ++;
+        in ++;
+    }
+}
+
+// Single-channel kernel: preLaunch() selects it for slot 0 when the input
+// element has vector size 1. For every input byte in [xstart, xend) it
+// increments the counter whose index equals the byte value.
+//
+// NOTE(review): assumes the input is one unsigned byte per element and that
+// each invocation owns 256 consecutive counters selected by p->lid (the same
+// layout kernelP1L uses) - confirm against the output allocation's vector size.
+void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
+
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
+    const unsigned char *in = (const unsigned char *)p->in;
+    int * sums = &cp->mSums[256 * p->lid];
+
+    for (uint32_t x = xstart; x < xend; x++) {
+        sums[in[0]] ++;
+        in ++;
+    }
+}
+
+
+// Constructs the histogram intrinsic.
+//  - Registers with the base class under RS_SCRIPT_INTRINSIC_ID_HISTOGRAM
+//    (not the blur ID, which belongs to a different intrinsic).
+//  - Allocates the counter storage: 256 * 4 ints per thread reported by the
+//    CPU context. The storage is not cleared here; preLaunch() zeroes it
+//    before every launch.
+//  - Installs default channel weights (0.299, 0.587, 0.114, 0) and their
+//    fixed-point form (scaled by 256, rounded).
+// No kernel is selected yet (mRootPtr is NULL until preLaunch() runs).
+RsdCpuScriptIntrinsicHistogram::RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx,
+                                                               const Script *s, const Element *e)
+        : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_HISTOGRAM) {
+
+    mRootPtr = NULL;
+    mSums = new int[256 * 4 * mCtx->getThreadCount()];
+    mDot[0] = 0.299f;
+    mDot[1] = 0.587f;
+    mDot[2] = 0.114f;
+    mDot[3] = 0;
+    for (int ch = 0; ch < 4; ch++) {
+        mDotI[ch] = (int)((mDot[ch] * 256.f) + 0.5f);
+    }
+}
+
+// Releases the counter array allocated by the constructor.
+RsdCpuScriptIntrinsicHistogram::~RsdCpuScriptIntrinsicHistogram() {
+    // delete[] on a null pointer is a no-op, so no explicit check is needed.
+    delete[] mSums;
+}
+
+// Reports two exported variables on the script: slot 0 is consumed by
+// setGlobalVar() and slot 1 by setGlobalObj().
+void RsdCpuScriptIntrinsicHistogram::populateScript(Script *s) {
+    const int kExportedVariableCount = 2;
+    s->mHal.info.exportedVariableCount = kExportedVariableCount;
+}
+
+// Intentionally a no-op: this override releases nothing (mAllocOut is not
+// cleared by this method).
+void RsdCpuScriptIntrinsicHistogram::invokeFreeChildren() {
+}
+
+
+// Factory used by RsdCpuReferenceImpl::createIntrinsic() for
+// RS_SCRIPT_INTRINSIC_ID_HISTOGRAM. Returns a newly allocated histogram
+// intrinsic.
+RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
+    RsdCpuScriptIntrinsicHistogram *script = new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
+    return script;
+}
+
+
diff --git a/rsDefines.h b/rsDefines.h
index 6e080de..0287f67 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -363,7 +363,8 @@
RS_SCRIPT_INTRINSIC_ID_BLUR = 5,
RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB = 6,
RS_SCRIPT_INTRINSIC_ID_BLEND = 7,
- RS_SCRIPT_INTRINSIC_ID_3DLUT = 8
+ RS_SCRIPT_INTRINSIC_ID_3DLUT = 8,
+ RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9
};
typedef struct {