Merge "ContextDestroy should go through FIFO" into nyc-dev
diff --git a/api/rs_convert.spec b/api/rs_convert.spec
index fa23662..931e7ec 100644
--- a/api/rs_convert.spec
+++ b/api/rs_convert.spec
@@ -78,7 +78,7 @@
attrib: const
w: 2, 3, 4
t: f16
-t: u8, u16, u32, u64, i8, i16, i32, i64, f32, f64
+t: u8, u16, u32, u64, i8, i16, i32, i64, f16, f32, f64
ret: #3#1
arg: #2#1 v, compatible(#3)
end:
diff --git a/api/rs_object_types.spec b/api/rs_object_types.spec
index 4336dd2..883aa71 100644
--- a/api/rs_object_types.spec
+++ b/api/rs_object_types.spec
@@ -102,7 +102,7 @@
version: 14
enum:
value: RS_ALLOCATION_USAGE_SCRIPT = 0x0001, "Allocation is bound to and accessed by scripts."
-value: RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE = 0x0002, "Deprecated."
+value: RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE = 0x0002, "Allocation is used as a texture source."
value: RS_ALLOCATION_USAGE_GRAPHICS_VERTEX = 0x0004, "Deprecated."
value: RS_ALLOCATION_USAGE_GRAPHICS_CONSTANTS = 0x0008, "Deprecated."
value: RS_ALLOCATION_USAGE_GRAPHICS_RENDER_TARGET = 0x0010, "Deprecated."
diff --git a/api/rs_vector_math.spec b/api/rs_vector_math.spec
index ae6d4df..c1d464d 100644
--- a/api/rs_vector_math.spec
+++ b/api/rs_vector_math.spec
@@ -210,6 +210,17 @@
test: vector
end:
+function: native_distance
+version: 24
+attrib: const
+w: 1, 2, 3, 4
+t: f16
+ret: #2
+arg: #2#1 left_vector
+arg: #2#1 right_vector
+test: vector
+end:
+
function: native_length
version: 21
attrib: const
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index b8b4838..9f9c429 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -45,6 +45,8 @@
using namespace android;
using namespace android::renderscript;
+// Logging hook used by the general-reduce code in this file. As defined here
+// it expands to nothing, because the ALOGV call is commented out; restore the
+// call to get verbose reduce logging.
+#define REDUCE_NEW_ALOGV(...) /* ALOGV(__VA_ARGS__) */
+
static pthread_key_t gThreadTLSKey = 0;
static uint32_t gThreadTLSKeyCount = 0;
static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
@@ -98,7 +100,7 @@
version_major = 0;
version_minor = 0;
- mInForEach = false;
+ mInKernel = false;
memset(&mWorkers, 0, sizeof(mWorkers));
memset(&mTlsStruct, 0, sizeof(mTlsStruct));
mExit = false;
@@ -239,6 +241,9 @@
ALOGE("pthread_setspecific %i", status);
}
+ mPageSize = sysconf(_SC_PAGE_SIZE);
+ REDUCE_NEW_ALOGV("page size = %ld", mPageSize);
+
GetCpuInfo();
int cpu = sysconf(_SC_NPROCESSORS_CONF);
@@ -435,7 +440,7 @@
}
}
-static void walk_1d(void *usr, uint32_t idx) {
+static void walk_1d_foreach(void *usr, uint32_t idx) {
MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
RsExpandKernelDriverInfo fep = mtls->fep;
fep.lid = idx;
@@ -458,6 +463,103 @@
}
}
+// format_bytes() -- logging helper that renders raw bytes as hex text.
+//
+// Reads bytes from inBuf and writes their textual form into *outBuf.
+//
+// Layout of the text produced:
+//   - a leading ": "
+//   - two lowercase hex digits per input byte
+//   - "." between neighbouring bytes, except that the separator in front of
+//     byte index 4, 8 and 12 is "|"
+//   - a trailing "..." when inBytes exceeds kFormatInBytesMax
+//
+// Parameters:
+//   - outBuf  -- destination buffer, of type FormatBuf
+//   - inBuf   -- bytes to format
+//   - inBytes -- number of bytes available at inBuf; only
+//                min(kFormatInBytesMax, inBytes) of them are read
+//
+// Returns a (const char *) pointing at the formatted text held in *outBuf.
+//
+static const int kFormatInBytesMax = 16;
+// Capacity: ": " + 2 digits per byte + 1 separator between bytes + "..." + null
+typedef char FormatBuf[2 + kFormatInBytesMax*2 + (kFormatInBytesMax - 1) + 3 + 1];
+static const char *format_bytes(FormatBuf *outBuf, const uint8_t *inBuf, const int inBytes) {
+  static const char kHexDigits[] = "0123456789abcdef";
+
+  char *cursor = *outBuf;
+  *cursor++ = ':';
+  *cursor++ = ' ';
+
+  const int numShown = std::min(kFormatInBytesMax, inBytes);
+  for (int byteIdx = 0; byteIdx < numShown; ++byteIdx) {
+    if (byteIdx != 0) {
+      // Group the bytes in fours: '|' between groups, '.' inside a group
+      *cursor++ = (byteIdx % 4 == 0) ? '|' : '.';
+    }
+    const uint8_t value = inBuf[byteIdx];
+    *cursor++ = kHexDigits[value >> 4];
+    *cursor++ = kHexDigits[value & 0x0f];
+  }
+
+  if (inBytes > kFormatInBytesMax) {
+    // Input was truncated; strcpy also writes the terminating null
+    strcpy(cursor, "...");
+  } else {
+    *cursor = '\0';
+  }
+  return *outBuf;
+}
+
+// Per-thread body of a 1D multi-threaded general reduction;
+// launchReduceNewParallel() passes it to launchThreads().
+//
+// Arguments:
+//   - usr -- the MTLaunchStructReduceNew describing this launch
+//   - idx -- index of the calling thread; selects this thread's slot in
+//            mtls->accumPtr
+//
+// If the thread's accumPtr slot is still null, the thread claims an
+// accumulator (see the description of accumAlloc/accumCount/accumPtr in
+// MTLaunchStructReduceNew) and initializes it. It then repeatedly claims the
+// next slice of [start.x, end.x) and runs the accumulator function over it,
+// returning once no slices remain.
+static void walk_1d_reduce_new(void *usr, uint32_t idx) {
+  // Deliberately not const-qualified: accumCount and mSliceNum are updated
+  // through this pointer with __sync_fetch_and_add, which must be given the
+  // address of a modifiable object.
+  MTLaunchStructReduceNew *mtls = static_cast<MTLaunchStructReduceNew *>(usr);
+  // Local copy of the driver info; handed to RedpPtrSetup() for every slice
+  RsExpandKernelDriverInfo redp = mtls->redp;
+
+  // find accumulator
+  uint8_t *&accumPtr = mtls->accumPtr[idx];
+  if (!accumPtr) {
+    uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);
+    if (mtls->outFunc) {
+      accumPtr = mtls->accumAlloc + mtls->accumStride * accumIdx;
+    } else {
+      // No outconverter: the output allocation serves as accumulator 0, so
+      // indices into accumAlloc are shifted down by one.
+      if (accumIdx == 0) {
+        accumPtr = mtls->redp.outPtr[0];
+      } else {
+        accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);
+      }
+    }
+    REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u got accumCount %u and accumPtr %p",
+                     mtls->accumFunc, idx, accumIdx, accumPtr);
+    // initialize accumulator
+    if (mtls->initFunc) {
+      mtls->initFunc(accumPtr);
+    } else {
+      memset(accumPtr, 0, mtls->accumSize);
+    }
+  }
+
+  // accumulate
+  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  while (1) {
+    // Atomically claim the next slice index
+    uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+    uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
+    uint32_t xEnd = xStart + mtls->mSliceSize;
+
+    xEnd = rsMin(xEnd, mtls->end.x);
+
+    // An empty slice means all of the input has been handed out
+    if (xEnd <= xStart) {
+      return;
+    }
+
+    RedpPtrSetup(mtls, &redp, xStart, 0, 0);
+    fn(&redp, xStart, xEnd, accumPtr);
+
+    FormatBuf fmt;
+    if (mtls->logReduceAccum) {
+      format_bytes(&fmt, accumPtr, mtls->accumSize);
+    } else {
+      fmt[0] = 0;
+    }
+    REDUCE_NEW_ALOGV("walk_1d_reduce_new(%p): idx = %u [%u, %u)%s",
+                     mtls->accumFunc, idx, xStart, xEnd, fmt);
+  }
+}
+
// Launch a simple reduce-style kernel.
// Inputs:
// ain: The allocation that contains the input
@@ -486,6 +588,25 @@
uint32_t inLen,
Allocation * aout,
MTLaunchStructReduceNew *mtls) {
+ mtls->logReduceAccum = mRSC->props.mLogReduceAccum;
+ if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
+ launchReduceNewParallel(ains, inLen, aout, mtls);
+ } else {
+ launchReduceNewSerial(ains, inLen, aout, mtls);
+ }
+}
+
+// Launch a general reduce-style kernel, single-threaded.
+// Inputs:
+// ains[0..inLen-1]: Array of allocations that contain the inputs
+// aout: The allocation that will hold the output
+// mtls: Holds launch parameters
+void RsdCpuReferenceImpl::launchReduceNewSerial(const Allocation ** ains,
+ uint32_t inLen,
+ Allocation * aout,
+ MTLaunchStructReduceNew *mtls) {
+ ALOGV("launchReduceNewSerial(%p)", mtls->accumFunc);
+
// In the presence of outconverter, we allocate temporary memory for
// the accumulator.
//
@@ -521,6 +642,112 @@
}
}
+// Launch a general reduce-style kernel, multi-threaded.
+//
+// Inputs:
+//   ains[0..inLen-1]: Array of allocations that contain the inputs
+//   aout: The allocation that will hold the output
+//   mtls: Holds launch parameters
+//
+// Falls back to launchReduceNewSerial() when the launch is not 1D, when the
+// kernel has no combiner, or when the accumulator storage cannot be allocated.
+void RsdCpuReferenceImpl::launchReduceNewParallel(const Allocation ** ains,
+                                                  uint32_t inLen,
+                                                  Allocation * aout,
+                                                  MTLaunchStructReduceNew *mtls) {
+  // For now, we don't know how to go parallel beyond 1D, or in the absence of a combiner.
+  if ((mtls->redp.dim.y > 1) || (mtls->redp.dim.z > 1) || !mtls->combFunc) {
+    launchReduceNewSerial(ains, inLen, aout, mtls);
+    return;
+  }
+
+  // Number of threads = "main thread" + number of other (worker) threads
+  const uint32_t numThreads = mWorkers.mCount + 1;
+
+  // In the absence of outconverter, we use the output allocation as
+  // an accumulator, and therefore need to allocate one fewer accumulator.
+  const uint32_t numAllocAccum = numThreads - (mtls->outFunc == nullptr);
+
+  // If mDebugReduceSplitAccum, then we want each accumulator to start
+  // on a page boundary.  (TODO: Would some unit smaller than a page
+  // be sufficient to avoid false sharing?)
+  if (mRSC->props.mDebugReduceSplitAccum) {
+    // Round up accumulator size to an integral number of pages
+    mtls->accumStride =
+        (unsigned(mtls->accumSize) + unsigned(mPageSize)-1) &
+        ~(unsigned(mPageSize)-1);
+    // Each accumulator gets its own page.  Alternatively, if we just
+    // wanted to make sure no two accumulators are on the same page,
+    // we could instead do
+    //   allocSize = mtls->accumStride * (numAllocAccum - 1) + mtls->accumSize
+    const size_t allocSize = mtls->accumStride * numAllocAccum;
+    mtls->accumAlloc = static_cast<uint8_t *>(memalign(mPageSize, allocSize));
+  } else {
+    mtls->accumStride = mtls->accumSize;
+    mtls->accumAlloc = static_cast<uint8_t *>(malloc(mtls->accumStride * numAllocAccum));
+  }
+
+  // One slot per thread, zero-filled so that every entry starts out null
+  mtls->accumPtr = static_cast<uint8_t **>(calloc(numThreads, sizeof(uint8_t *)));
+
+  // Without this storage the worker threads would dereference null, so give
+  // up on going parallel and run the kernel single-threaded instead.
+  if (!mtls->accumAlloc || !mtls->accumPtr) {
+    ALOGE("launchReduceNewParallel(%p): accumulator allocation failed, running serially",
+          mtls->accumFunc);
+    free(mtls->accumPtr);
+    free(mtls->accumAlloc);
+    mtls->accumPtr = nullptr;
+    mtls->accumAlloc = nullptr;
+    launchReduceNewSerial(ains, inLen, aout, mtls);
+    return;
+  }
+
+  mtls->accumCount = 0;
+
+  rsAssert(!mInKernel);
+  mInKernel = true;
+  mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));
+  ALOGV("launchReduceNewParallel(%p): %u threads, accumAlloc = %p",
+        mtls->accumFunc, numThreads, mtls->accumAlloc);
+  launchThreads(walk_1d_reduce_new, mtls);
+  mInKernel = false;
+
+  // Combine accumulators and identify final accumulator
+  uint8_t *finalAccumPtr = (mtls->outFunc ? nullptr : mtls->redp.outPtr[0]);
+  // Loop over accumulators, combining into finalAccumPtr.  If finalAccumPtr
+  // is null, then the first accumulator I find becomes finalAccumPtr.
+  for (unsigned idx = 0; idx < mtls->accumCount; ++idx) {
+    uint8_t *const thisAccumPtr = mtls->accumPtr[idx];
+    if (finalAccumPtr) {
+      if (finalAccumPtr != thisAccumPtr) {
+        if (mtls->combFunc) {
+          if (mtls->logReduceAccum) {
+            FormatBuf fmt;
+            REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): accumulating into%s",
+                             mtls->accumFunc,
+                             format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+            REDUCE_NEW_ALOGV("launchReduceNewParallel(%p):    accumulator[%d]%s",
+                             mtls->accumFunc, idx,
+                             format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
+          }
+          mtls->combFunc(finalAccumPtr, thisAccumPtr);
+        } else {
+          rsAssert(!"expected combiner");
+        }
+      }
+    } else {
+      finalAccumPtr = thisAccumPtr;
+    }
+  }
+  rsAssert(finalAccumPtr != nullptr);
+  if (mtls->logReduceAccum) {
+    FormatBuf fmt;
+    REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final accumulator%s",
+                     mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+  }
+
+  // Outconvert
+  if (mtls->outFunc) {
+    mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
+    if (mtls->logReduceAccum) {
+      FormatBuf fmt;
+      REDUCE_NEW_ALOGV("launchReduceNewParallel(%p): final outconverted result%s",
+                       mtls->accumFunc,
+                       format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
+    }
+  }
+
+  // Clean up
+  free(mtls->accumPtr);
+  free(mtls->accumAlloc);
+}
+
+
void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
uint32_t inLen,
Allocation* aout,
@@ -537,9 +764,9 @@
(mtls->start.array[2] != mtls->end.array[2]) ||
(mtls->start.array[3] != mtls->end.array[3]);
- if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+ if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
const size_t targetByteChunk = 16 * 1024;
- mInForEach = true;
+ mInKernel = true; // NOTE: The guard immediately above ensures this was !mInKernel
if (outerDims) {
// No fancy logic for chunk size
@@ -588,9 +815,9 @@
mtls->mSliceSize = 1;
}
- launchThreads(walk_1d, mtls);
+ launchThreads(walk_1d_foreach, mtls);
}
- mInForEach = false;
+ mInKernel = false;
} else {
ForEachFunc_t fn = mtls->kernel;
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 939b7ae..c2a0864 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -34,6 +34,7 @@
// Function types found in RenderScript code
typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+typedef void (*ReduceNewCombinerFunc_t)(uint8_t *accum, const uint8_t *other);
typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
@@ -44,6 +45,7 @@
struct ReduceNewDescription {
ReduceNewAccumulatorFunc_t accumFunc; // expanded accumulator function
ReduceNewInitializerFunc_t initFunc; // user initializer function
+ ReduceNewCombinerFunc_t combFunc; // user combiner function
ReduceNewOutConverterFunc_t outFunc; // user outconverter function
size_t accumSize; // accumulator datum size, in bytes
};
@@ -73,7 +75,8 @@
RsLaunchDimensions start;
RsLaunchDimensions end;
// Points to MTLaunchStructForEach::fep::dim or
- // MTLaunchStructReduce::inputDim.
+ // MTLaunchStructReduce::inputDim or
+ // MTLaunchStructReduceNew::redp::dim.
RsLaunchDimensions *dimPtr;
};
@@ -101,9 +104,51 @@
ReduceNewAccumulatorFunc_t accumFunc;
ReduceNewInitializerFunc_t initFunc;
+ ReduceNewCombinerFunc_t combFunc;
ReduceNewOutConverterFunc_t outFunc;
size_t accumSize; // accumulator datum size in bytes
+
+ size_t accumStride; // stride between accumulators in accumAlloc (below)
+
+ // These fields are used for managing accumulator data items in a
+ // multithreaded execution.
+ //
+ // Let the number of threads be N.
+ // Let Outc be true iff there is an outconverter.
+ //
+ // accumAlloc is a pointer to a single allocation of (N - !Outc)
+ // accumulators. (If there is no outconverter, then the output
+ // allocation acts as an accumulator.) It is created at kernel
+ // launch time. Within that allocation, the distance between the
+ // start of adjacent accumulators is accumStride bytes -- this
+ // might be the same as accumSize, or it might be larger, if we
+ // are attempting to avoid false sharing.
+ //
+ // accumCount is an atomic counter of how many accumulators have
+ // been grabbed by threads. It is initialized to zero at kernel
+ // launch time. See accumPtr for further description.
+ //
+ // accumPtr is a pointer to an array of N pointers to accumulators.
+ // The array is created at kernel launch time, and each element is
+ // initialized to nullptr. When a particular thread goes to work,
+ // that thread obtains its accumulator from its entry in this
+ // array. If the entry is nullptr, that thread needs to obtain an
+ // accumulator, and initialize its entry in the array accordingly.
+ // It does so via atomic access (fetch-and-add) to accumCount.
+ // - If Outc, then the fetched value is used as an index into
+ // accumAlloc.
+ // - If !Outc, then
+ // - If the fetched value is zero, then this thread gets the
+ // output allocation for its accumulator.
+ // - If the fetched value is nonzero, then (fetched value - 1)
+ // is used as an index into accumAlloc.
+ uint8_t *accumAlloc;
+ uint8_t **accumPtr;
+ uint32_t accumCount;
+
+ // Logging control
+ bool logReduceAccum;
};
class RsdCpuReferenceImpl : public RsdCpuReference {
@@ -161,7 +206,7 @@
virtual const char *getBccPluginName() const {
return mBccPluginName.string();
}
- bool getInForEach() override { return mInForEach; }
+ bool getInKernel() override { return mInKernel; }
// Set to true if we should embed global variable information in the code.
void setEmbedGlobalInfo(bool v) override {
@@ -190,7 +235,7 @@
uint32_t version_major;
uint32_t version_minor;
//bool mHasGraphics;
- bool mInForEach;
+ bool mInKernel; // Is a parallel kernel execution underway?
struct Workers {
volatile int mRunningCount;
@@ -222,6 +267,14 @@
// when potentially embedding information about globals.
// Defaults to true.
bool mEmbedGlobalInfoSkipConstant;
+
+ long mPageSize;
+
+ // Launch a general reduce kernel
+ void launchReduceNewSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+ MTLaunchStructReduceNew *mtls);
+ void launchReduceNewParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+ MTLaunchStructReduceNew *mtls);
};
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index 5dd31ee..1a0e70e 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -538,8 +538,8 @@
goto error;
}
- // The current implementation does not use the signature,
- // reduce name, or combiner.
+ // The current implementation does not use the signature
+ // or reduce name.
reduceNewDescriptions[i].accumSize = tmpSize;
@@ -565,6 +565,19 @@
goto error;
}
+ // Process the (optional) combiner.
+ if (strcmp(tmpNameCombiner, kNoName)) {
+ // Lookup the original user-written combiner.
+ if (!(reduceNewDescriptions[i].combFunc =
+ (ReduceNewCombinerFunc_t) dlsym(sharedObj, tmpNameCombiner))) {
+ ALOGE("Failed to find combiner function address for %s(): %s",
+ tmpNameCombiner, dlerror());
+ goto error;
+ }
+ } else {
+ reduceNewDescriptions[i].combFunc = nullptr;
+ }
+
// Process the (optional) outconverter.
if (strcmp(tmpNameOutConverter, kNoName)) {
// Lookup the original user-written outconverter.
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a5fc96b..25dab00 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -646,9 +646,9 @@
mtls->rs = mCtx;
- // Currently not threaded.
- mtls->isThreadable = false;
- mtls->mSliceNum = -1;
+ mtls->mSliceNum = 0;
+ mtls->mSliceSize = 1;
+ mtls->isThreadable = mIsThreadable;
// Set up output,
mtls->redp.outLen = 1;
@@ -843,6 +843,7 @@
const ReduceNewDescription *desc = mScriptExec->getReduceNewDescription(slot);
mtls->accumFunc = desc->accumFunc;
mtls->initFunc = desc->initFunc; // might legally be nullptr
+ mtls->combFunc = desc->combFunc; // might legally be nullptr
mtls->outFunc = desc->outFunc; // might legally be nullptr
mtls->accumSize = desc->accumSize;
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 49a999d..e226b93 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -131,7 +131,7 @@
uint32_t flags) = 0;
virtual CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) = 0;
virtual void* createScriptGroup(const ScriptGroupBase *sg) = 0;
- virtual bool getInForEach() = 0;
+ virtual bool getInKernel() = 0; // Is a parallel kernel execution underway?
// Set to true if we should embed global variable information in the code.
virtual void setEmbedGlobalInfo(bool v) = 0;
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index b4958e2..9fea491 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -136,7 +136,7 @@
RsdHal *dc = (RsdHal *)rsc->mHal.drv;
RsdCpuReference *impl = (RsdCpuReference *) dc->mCpuRef;
- if (impl->getInForEach()) {
+ if (impl->getInKernel()) {
char buf[256];
snprintf(buf, sizeof(buf), "Error: Call to unsupported function %s "
"in kernel", funcName);
diff --git a/java/tests/HealingBrush/src/rs/example/android/com/healingbrush/DrawView.java b/java/tests/HealingBrush/src/rs/example/android/com/healingbrush/DrawView.java
index 2995369..f091fc4 100644
--- a/java/tests/HealingBrush/src/rs/example/android/com/healingbrush/DrawView.java
+++ b/java/tests/HealingBrush/src/rs/example/android/com/healingbrush/DrawView.java
@@ -42,6 +42,8 @@
Paint mPaint1;
Paint mPaint2;
private boolean mDone;
+ private boolean mUseDefaultRegion = true;
+
ArrayList<Drawable> drawList = new ArrayList<Drawable>();
private void setup(Context context) {
@@ -94,8 +96,16 @@
}
public Region getRegion(Bitmap img) {
- Region ret = new Region(Arrays.copyOf(path, len), img);
-
+ // Builds the Region for img. While mUseDefaultRegion is still set, a
+ // hard-coded path of eight floats is used; otherwise the first len
+ // entries of path are used.
+ // NOTE(review): the eight values look like four (x, y) corner points of
+ // a diamond -- confirm against Region's constructor.
+ Region ret;
+ if (mUseDefaultRegion) {
+ float[] defaultPath = {10.0f, 110.0f,
+ 110.0f, 10.0f,
+ 210.0f, 110.0f,
+ 110.0f, 210.0f};
+ ret = new Region(Arrays.copyOf(defaultPath, defaultPath.length), img);
+ } else {
+ ret = new Region(Arrays.copyOf(path, len), img);
+ }
invalidate();
return ret;
}
@@ -108,6 +118,7 @@
mPoints_backup.addPath(mPoints);
mPoints.reset();
mPoints.moveTo(imgPoint[0], imgPoint[1]);
+ mUseDefaultRegion = false;
}
public void undo() {
diff --git a/java/tests/HealingBrush/src/rs/example/android/com/healingbrush/MainActivity.java b/java/tests/HealingBrush/src/rs/example/android/com/healingbrush/MainActivity.java
index 2f41ff7..be0a9ac 100644
--- a/java/tests/HealingBrush/src/rs/example/android/com/healingbrush/MainActivity.java
+++ b/java/tests/HealingBrush/src/rs/example/android/com/healingbrush/MainActivity.java
@@ -264,14 +264,16 @@
folder = Environment.getExternalStoragePublicDirectory(Environment.DIRECTORY_DOWNLOADS);
mImagePath = folder.getPath();
File[] files = folder.listFiles();
- Log.v(TAG, "files" + files.length);
- for (int i = 0; i < files.length; i++) {
- Log.v(TAG, "[" + i + "]=" + files[i].getAbsolutePath());
- if (files[i].getName().toLowerCase().endsWith(".jpg")) {
- mDisplayedImage = BitmapFactory.decodeFile(files[i].getAbsolutePath());
- mImagePath = files[i].getParentFile().getAbsolutePath();
- mImageName = files[i].getName();
- return;
+ if (files != null) {
+ Log.v(TAG, "files" + files.length);
+ for (int i = 0; i < files.length; i++) {
+ Log.v(TAG, "[" + i + "]=" + files[i].getAbsolutePath());
+ if (files[i].getName().toLowerCase().endsWith(".jpg")) {
+ mDisplayedImage = BitmapFactory.decodeFile(files[i].getAbsolutePath());
+ mImagePath = files[i].getParentFile().getAbsolutePath();
+ mImageName = files[i].getName();
+ return;
+ }
}
}
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
index 608de47..c1e9c40 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce.java
@@ -119,39 +119,6 @@
///////////////////////////////////////////////////////////////////
- private float dp(float[] input1, float[] input2) {
- _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
- float rslt = 0;
- for (int idx = 0; idx < input1.length; ++idx)
- rslt += input1[idx] * input2[idx];
- return rslt;
- }
-
- private boolean dp(RenderScript RS, ScriptC_reduce s) {
- final float[] input1 = createInputArrayFloat(100000, 2);
- final float[] input2 = createInputArrayFloat(100000, 3);
-
- final float javaRslt = dp(input1, input2);
- final float rsRslt = s.reduce_dp(input1, input2).get();
-
- // NOTE: Using a floating point equality check to test for
- // correctness -- as we do below -- is a bad idea. It's only
- // reliable if the Java and RenderScript implementation of dp
- // use the same algorithm. Equality could be broken by
- // different optimizations between the two, or running the
- // RenderScript algorithm multithreaded, or running the
- // RenderScript algorithm on a GPU rather than the CPU.
- //
- // Should we be checking instead that the results are
- // "sufficiently close"? Cooking the input set to try to
- // ensure a deterministic result? Changing to integers
- // instead?
- return result("dp", javaRslt, rsRslt);
- }
-
- ///////////////////////////////////////////////////////////////////
-
private Int2 findMinAndMax(float[] input) {
float minVal = Float.POSITIVE_INFINITY;
int minIdx = -1;
@@ -322,7 +289,6 @@
boolean pass = true;
pass &= addint1D(pRS, s);
pass &= addint2D(pRS, s);
- pass &= dp(pRS, s);
pass &= findMinAndMax(pRS, s);
pass &= fz(pRS, s);
pass &= fz2(pRS, s);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
index 84d2c50..b998f51 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/UT_reduce_backward.java
@@ -119,39 +119,6 @@
///////////////////////////////////////////////////////////////////
- private float dp(float[] input1, float[] input2) {
- _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
- float rslt = 0;
- for (int idx = 0; idx < input1.length; ++idx)
- rslt += input1[idx] * input2[idx];
- return rslt;
- }
-
- private boolean dp(RenderScript RS, ScriptC_reduce_backward s) {
- final float[] input1 = createInputArrayFloat(100000, 2);
- final float[] input2 = createInputArrayFloat(100000, 3);
-
- final float javaRslt = dp(input1, input2);
- final float rsRslt = s.reduce_dp(input1, input2).get();
-
- // NOTE: Using a floating point equality check to test for
- // correctness -- as we do below -- is a bad idea. It's only
- // reliable if the Java and RenderScript implementation of dp
- // use the same algorithm. Equality could be broken by
- // different optimizations between the two, or running the
- // RenderScript algorithm multithreaded, or running the
- // RenderScript algorithm on a GPU rather than the CPU.
- //
- // Should we be checking instead that the results are
- // "sufficiently close"? Cooking the input set to try to
- // ensure a deterministic result? Changing to integers
- // instead?
- return result("dp", javaRslt, rsRslt);
- }
-
- ///////////////////////////////////////////////////////////////////
-
private Int2 findMinAndMax(float[] input) {
float minVal = Float.POSITIVE_INFINITY;
int minIdx = -1;
@@ -322,7 +289,6 @@
boolean pass = true;
pass &= addint1D(pRS, s);
pass &= addint2D(pRS, s);
- pass &= dp(pRS, s);
pass &= findMinAndMax(pRS, s);
pass &= fz(pRS, s);
pass &= fz2(pRS, s);
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
index be09dfb..ec7be8b 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce.rs
@@ -16,18 +16,6 @@
/////////////////////////////////////////////////////////////////////////
-#pragma rs reduce(dp) \
- accumulator(dpAccum) combiner(dpSum)
-
-static void dpAccum(float *accum, float in1, float in2) {
- *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-/////////////////////////////////////////////////////////////////////////
-
#pragma rs reduce(findMinAndMax) \
initializer(fMMInit) accumulator(fMMAccumulator) \
combiner(fMMCombiner) outconverter(fMMOutConverter)
@@ -61,8 +49,10 @@
static void fMMCombiner(MinAndMax *accum,
const MinAndMax *val) {
+ // Merge val into accum: take val's min entry only if its value is strictly
+ // smaller, and val's max entry only if its value is strictly larger, so
+ // accum's entry is kept when the values are equal.
- fMMAccumulator(accum, val->min.val, val->min.idx);
- fMMAccumulator(accum, val->max.val, val->max.idx);
+ if (val->min.val < accum->min.val)
+ accum->min = val->min;
+ if (val->max.val > accum->max.val)
+ accum->max = val->max;
}
static void fMMOutConverter(int2 *result,
diff --git a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
index 419e709..41252c8 100644
--- a/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
+++ b/java/tests/RSTest_CompatLib/src/com/android/rs/test/reduce_backward.rs
@@ -15,18 +15,6 @@
/////////////////////////////////////////////////////////////////////////
-static void dpAccum(float *accum, float in1, float in2) {
- *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-#pragma rs reduce(dp) \
- accumulator(dpAccum) combiner(dpSum)
-
-/////////////////////////////////////////////////////////////////////////
-
typedef struct {
float val;
int idx;
@@ -56,8 +44,10 @@
static void fMMCombiner(MinAndMax *accum,
const MinAndMax *val) {
+ // Merge val into accum: take val's min entry only if its value is strictly
+ // smaller, and val's max entry only if its value is strictly larger, so
+ // accum's entry is kept when the values are equal.
- fMMAccumulator(accum, val->min.val, val->min.idx);
- fMMAccumulator(accum, val->max.val, val->max.idx);
+ if (val->min.val < accum->min.val)
+ accum->min = val->min;
+ if (val->max.val > accum->max.val)
+ accum->max = val->max;
}
static void fMMOutConverter(int2 *result,
diff --git a/java/tests/RsTest/AndroidManifest.xml b/java/tests/RsTest/AndroidManifest.xml
index b660398..31da896 100644
--- a/java/tests/RsTest/AndroidManifest.xml
+++ b/java/tests/RsTest/AndroidManifest.xml
@@ -2,6 +2,7 @@
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="com.android.rs.test">
<application
+ android:largeHeap="true"
android:label="_RS_Test"
android:icon="@drawable/test_pattern">
<activity android:name="RSTest"
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
index a244646..0769259 100644
--- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce.java
@@ -27,6 +27,7 @@
import android.renderscript.*;
import android.util.Log;
import java.lang.Float;
+import java.util.Arrays;
import java.util.Random;
public class UT_reduce extends UnitTest {
@@ -36,6 +37,81 @@
super(rstc, "reduce", ctx);
}
+    /**
+     * Timestamps and input-size bookkeeping for one test case, plus the
+     * formatting of those numbers for the log line. Durations are reported
+     * with an "ms" suffix; a field left at -1 means "not recorded".
+     */
+    private static class timing {
+        /** Formats counts with thousands separators. */
+        private static final java.text.DecimalFormat fmt =
+                new java.text.DecimalFormat("###,###");
+
+        private long javaStart = -1;
+        private long javaEnd = -1;
+        private long rsStart = -1;
+        private long copyStart = -1;
+        private long kernelStart = -1;
+        private long rsEnd = -1;
+        private long inputBytes = -1;
+        private long inputCells = -1;
+
+        /**
+         * Records all six timestamps. The byte count is summed over every
+         * input Allocation; the cell count is taken from the first one, or
+         * is 0 when there are no inputs.
+         */
+        timing(long myJavaStart, long myJavaEnd, long myRsStart,
+               long myCopyStart, long myKernelStart, long myRsEnd,
+               Allocation... myInputs) {
+            javaStart = myJavaStart;
+            javaEnd = myJavaEnd;
+            rsStart = myRsStart;
+            copyStart = myCopyStart;
+            kernelStart = myKernelStart;
+            rsEnd = myRsEnd;
+
+            long totalBytes = 0;
+            for (int i = 0; i < myInputs.length; ++i) {
+                totalBytes += myInputs[i].getBytesSize();
+            }
+            inputBytes = totalBytes;
+
+            if (myInputs.length > 0) {
+                inputCells = myInputs[0].getType().getCount();
+            } else {
+                inputCells = 0;
+            }
+        }
+
+        /** Records only a cell count; no timestamps and no byte count. */
+        timing(long myInputCells) {
+            inputCells = myInputCells;
+        }
+
+        public long javaTime() { return javaEnd - javaStart; }
+        public long rsTime() { return rsEnd - rsStart; }
+        public long kernelTime() { return rsEnd - kernelStart; }
+        public long overheadTime() { return kernelStart - rsStart; }
+        public long allocationTime() { return copyStart - rsStart; }
+        public long copyTime() { return kernelStart - copyStart; }
+
+        /** Shorthand for building a fully timed instance and formatting it. */
+        public static String string(long myJavaStart, long myJavaEnd, long myRsStart,
+                                    long myCopyStart, long myKernelStart, long myRsEnd,
+                                    Allocation... myInputs) {
+            final timing t = new timing(myJavaStart, myJavaEnd, myRsStart,
+                                        myCopyStart, myKernelStart, myRsEnd, myInputs);
+            return t.string();
+        }
+
+        /** Shorthand for building a size-only instance and formatting it. */
+        public static String string(long myInputCells) {
+            final timing t = new timing(myInputCells);
+            return t.string();
+        }
+
+        /**
+         * Returns the timing section (only when timestamps were recorded)
+         * followed by the size section (only when the cell count is positive),
+         * separated by one space when both are present.
+         */
+        public String string() {
+            final boolean haveTimes = javaStart >= 0;
+            final boolean haveCells = inputCells > 0;
+            final StringBuilder text = new StringBuilder();
+
+            if (haveTimes) {
+                text.append("(java ").append(javaTime())
+                    .append("ms, rs ").append(rsTime())
+                    .append("ms = overhead ").append(overheadTime())
+                    .append("ms (alloc ").append(allocationTime())
+                    .append("ms + copy ").append(copyTime())
+                    .append("ms) + kernel+get() ").append(kernelTime())
+                    .append("ms)");
+                if (haveCells) {
+                    text.append(" ");
+                }
+            }
+
+            if (haveCells) {
+                text.append("(").append(fmt.format(inputCells)).append(" cells");
+                if (inputBytes > 0) {
+                    text.append(", ").append(fmt.format(inputBytes)).append(" bytes");
+                }
+                text.append(")");
+            }
+
+            return text.toString();
+        }
+    }
+
private byte[] createInputArrayByte(int len, int seed) {
byte[] array = new byte[len];
(new Random(seed)).nextBytes(array);
@@ -66,21 +142,60 @@
return array;
}
- private <T extends Number> boolean result(String testName, T javaRslt, T rsRslt) {
+ private <T extends Number> boolean result(String testName, final timing t,
+ T javaRslt, T rsRslt) {
final boolean success = javaRslt.equals(rsRslt);
- Log.i(TAG,
- testName + ": java " + javaRslt + ", rs " + rsRslt + ": " +
- (success ? "PASSED" : "FAILED"));
+ String status = (success ? "PASSED" : "FAILED");
+ if (success && (t != null))
+ status += " " + t.string();
+ Log.i(TAG, testName + ": java " + javaRslt + ", rs " + rsRslt + ": " + status);
return success;
}
- private boolean result(String testName, Int2 javaRslt, Int2 rsRslt) {
+ private boolean result(String testName, final timing t,
+ final long[] javaRslt, final long[] rsRslt) {
+ if (javaRslt.length != rsRslt.length) {
+ Log.i(TAG, testName + ": java length " + javaRslt.length +
+ ", rs length " + rsRslt.length + ": FAILED");
+ return false;
+ }
+ for (int i = 0; i < javaRslt.length; ++i) {
+ if (javaRslt[i] != rsRslt[i]) {
+ Log.i(TAG, testName + "[" + i + "]: java " + javaRslt[i] +
+ ", rs " + rsRslt[i] + ": FAILED");
+ return false;
+ }
+ }
+ String status = "PASSED";
+ if (t != null)
+ status += " " + t.string();
+ Log.i(TAG, testName + ": " + status);
+ return true;
+ }
+
+ private boolean result(String testName, final timing t, Int2 javaRslt, Int2 rsRslt) {
final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success && (t != null))
+ status += " " + t.string();
Log.i(TAG,
testName +
": java (" + javaRslt.x + ", " + javaRslt.y + ")" +
", rs (" + rsRslt.x + ", " + rsRslt.y + ")" +
- ": " + (success ? "PASSED" : "FAILED"));
+ ": " + status);
+ return success;
+ }
+
+ private boolean result(String testName, final timing t, Float2 javaRslt, Float2 rsRslt) {
+ final boolean success = (javaRslt.x == rsRslt.x) && (javaRslt.y == rsRslt.y);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success && (t != null))
+ status += " " + t.string();
+ Log.i(TAG,
+ testName +
+ ": java (" + javaRslt.x + ", " + javaRslt.y + ")" +
+ ", rs (" + rsRslt.x + ", " + rsRslt.y + ")" +
+ ": " + status);
return success;
}
@@ -93,61 +208,68 @@
return rslt;
}
- private boolean addint1D(RenderScript RS, ScriptC_reduce s) {
- final int[] input = createInputArrayInt(100000, 0, 1 << 13);
+ private boolean addint1D_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final int[] input = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]);
final int javaRslt = addint(input);
final int rsRslt = s.reduce_addint(input).get();
- return result("addint1D", javaRslt, rsRslt);
+ return result("addint1D_array", new timing(size[0]), javaRslt, rsRslt);
}
- private boolean addint2D(RenderScript RS, ScriptC_reduce s) {
- final int dimX = 450, dimY = 225;
+ private boolean addint1D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final int[] inputArray = createInputArrayInt(size[0], seed, Integer.MAX_VALUE / size[0]);
- final int[] inputArray = createInputArrayInt(dimX * dimY, 1, 1 << 13);
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final int javaRslt = addint(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocation.copyFrom(inputArray);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final int rsRslt = s.reduce_addint(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+ return result("addint1D",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+ javaRslt, rsRslt);
+ }
+
+ private boolean addint2D(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final int dimX = size[0];
+ final int dimY = size[1];
+
+ final int[] inputArray = createInputArrayInt(dimX * dimY, seed, Integer.MAX_VALUE / (dimX * dimY));
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final int javaRslt = addint(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
typeBuilder.setX(dimX).setY(dimY);
Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
- final int javaRslt = addint(inputArray);
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
final int rsRslt = s.reduce_addint(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
- return result("addint2D", javaRslt, rsRslt);
- }
-
- ///////////////////////////////////////////////////////////////////
-
- private float dp(float[] input1, float[] input2) {
- _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
- float rslt = 0;
- for (int idx = 0; idx < input1.length; ++idx)
- rslt += input1[idx] * input2[idx];
- return rslt;
- }
-
- private boolean dp(RenderScript RS, ScriptC_reduce s) {
- final float[] input1 = createInputArrayFloat(100000, 2);
- final float[] input2 = createInputArrayFloat(100000, 3);
-
- final float javaRslt = dp(input1, input2);
- final float rsRslt = s.reduce_dp(input1, input2).get();
-
- // NOTE: Using a floating point equality check to test for
- // correctness -- as we do below -- is a bad idea. It's only
- // reliable if the Java and RenderScript implementation of dp
- // use the same algorithm. Equality could be broken by
- // different optimizations between the two, or running the
- // RenderScript algorithm multithreaded, or running the
- // RenderScript algorithm on a GPU rather than the CPU.
- //
- // Should we be checking instead that the results are
- // "sufficiently close"? Cooking the input set to try to
- // ensure a deterministic result? Changing to integers
- // instead?
- return result("dp", javaRslt, rsRslt);
+ return result("addint2D",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+ javaRslt, rsRslt);
}
///////////////////////////////////////////////////////////////////
@@ -172,79 +294,195 @@
return new Int2(minIdx, maxIdx);
}
- private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s) {
- final float[] input = createInputArrayFloat(100000, 4);
+ private boolean findMinAndMax_array(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final float[] input = createInputArrayFloat(size[0], seed);
final Int2 javaRslt = findMinAndMax(input);
final Int2 rsRslt = s.reduce_findMinAndMax(input).get();
- return result("findMinAndMax", javaRslt, rsRslt);
+ // Note that the Java and RenderScript algorithms are not
+ // guaranteed to find the same cells -- but they should
+ // find cells of the same value.
+ final Float2 javaVal = new Float2(input[javaRslt.x], input[javaRslt.y]);
+ final Float2 rsVal = new Float2(input[rsRslt.x], input[rsRslt.y]);
+
+ return result("findMinAndMax_array", new timing(size[0]), javaVal, rsVal);
+ }
+
+ private boolean findMinAndMax(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final float[] inputArray = createInputArrayFloat(size[0], seed);
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final Int2 javaRslt = findMinAndMax(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocation = Allocation.createSized(RS, Element.F32(RS), inputArray.length);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocation.copyFrom(inputArray);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final Int2 rsRslt = s.reduce_findMinAndMax(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+ // Note that the Java and RenderScript algorithms are not
+ // guaranteed to find the same cells -- but they should
+ // find cells of the same value.
+ final Float2 javaVal = new Float2(inputArray[javaRslt.x], inputArray[javaRslt.y]);
+ final Float2 rsVal = new Float2(inputArray[rsRslt.x], inputArray[rsRslt.y]);
+
+ return result("findMinAndMax",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+ javaVal, rsVal);
}
///////////////////////////////////////////////////////////////////
- private boolean fz(RenderScript RS, ScriptC_reduce s) {
- final int inputLen = 100000;
- int[] input = createInputArrayInt(inputLen, 5);
+ // Java-side reference used by the fz tests: returns the index of the
+ // first zero element of input, or -1 if input contains no zero.
+ private int fz(final int[] input) {
+     int idx = 0;
+     while (idx < input.length) {
+         if (input[idx] == 0)
+             return idx;
+         ++idx;
+     }
+     return -1;
+ }
+
+ private boolean fz_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final int inputLen = size[0];
+ int[] input = createInputArrayInt(inputLen, seed+0);
// just in case we got unlucky
- input[(new Random(6)).nextInt(inputLen)] = 0;
+ input[(new Random(seed+1)).nextInt(inputLen)] = 0;
final int rsRslt = s.reduce_fz(input).get();
final boolean success = (input[rsRslt] == 0);
Log.i(TAG,
- "fz: input[" + rsRslt + "] == " + input[rsRslt] + ": " +
- (success ? "PASSED" : "FAILED"));
+ "fz_array: input[" + rsRslt + "] == " + input[rsRslt] + ": " +
+ (success ? "PASSED " + timing.string(size[0]) : "FAILED"));
+ return success;
+ }
+
+ // Allocation-based fz test. Runs the Java reference and reduce_fz over the
+ // same input, taking wall-clock timestamps around each phase. The test
+ // passes when the cell at the index RenderScript returns holds zero; that
+ // index is not compared against the Java one.
+ //   seed - seed and seed+1 are used for the input data and for the
+ //          position of the forced zero
+ //   size - size[0] is the input length
+ private boolean fz(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final int inputLen = size[0];
+ int[] inputArray = createInputArrayInt(inputLen, seed+0);
+ // just in case we got unlucky
+ inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final int javaRslt = fz(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocation = Allocation.createSized(RS, Element.I32(RS), inputArray.length);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocation.copyFrom(inputArray);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final int rsRslt = s.reduce_fz(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+ final boolean success = (inputArray[rsRslt] == 0);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success)
+ status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
+ // Each side reports the value found at its own index, so a failing
+ // RenderScript result shows the offending cell.
+ Log.i(TAG,
+ "fz: java input[" + javaRslt + "] == " + inputArray[javaRslt] +
+ ", rs input[" + rsRslt + "] == " + inputArray[rsRslt] + ": " + status);
 return success;
 }
///////////////////////////////////////////////////////////////////
- private boolean fz2(RenderScript RS, ScriptC_reduce s) {
- final int dimX = 225, dimY = 450;
+ private boolean fz2(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final int dimX = size[0], dimY = size[1];
final int inputLen = dimX * dimY;
- int[] inputArray = createInputArrayInt(inputLen, 7);
+ int[] inputArray = createInputArrayInt(inputLen, seed+0);
// just in case we got unlucky
- inputArray[(new Random(8)).nextInt(inputLen)] = 0;
+ inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final int javaRsltLinear = fz(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final Int2 javaRslt = new Int2(javaRsltLinear % dimX, javaRsltLinear / dimX);
+ final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y];
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
typeBuilder.setX(dimX).setY(dimY);
Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
inputAllocation.copy2DRangeFrom(0, 0, dimX, dimY, inputArray);
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
final Int2 rsRslt = s.reduce_fz2(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
- final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y];
- final boolean success = (cellVal == 0);
+ final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y];
+ final boolean success = (rsCellVal == 0);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success)
+ status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
Log.i(TAG,
- "fz2: input[" + rsRslt.x + ", " + rsRslt.y + "] == " + cellVal + ": " +
- (success ? "PASSED" : "FAILED"));
+ "fz2: java input[" + javaRslt.x + ", " + javaRslt.y + "] == " + javaCellVal +
+ ", rs input[" + rsRslt.x + ", " + rsRslt.y + "] == " + rsCellVal + ": " + status);
return success;
}
///////////////////////////////////////////////////////////////////
- private boolean fz3(RenderScript RS, ScriptC_reduce s) {
- final int dimX = 59, dimY = 48, dimZ = 37;
+ private boolean fz3(RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+ final int dimX = size[0], dimY = size[1], dimZ = size[2];
final int inputLen = dimX * dimY * dimZ;
- int[] inputArray = createInputArrayInt(inputLen, 9);
+ int[] inputArray = createInputArrayInt(inputLen, seed+0);
// just in case we got unlucky
- inputArray[(new Random(10)).nextInt(inputLen)] = 0;
+ inputArray[(new Random(seed+1)).nextInt(inputLen)] = 0;
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final int javaRsltLinear = fz(inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ final Int3 javaRslt = new Int3(
+ javaRsltLinear % dimX,
+ (javaRsltLinear / dimX) % dimY,
+ javaRsltLinear / (dimX * dimY));
+ final int javaCellVal = inputArray[javaRslt.x + dimX * javaRslt.y + dimX * dimY * javaRslt.z];
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
Type.Builder typeBuilder = new Type.Builder(RS, Element.I32(RS));
typeBuilder.setX(dimX).setY(dimY).setZ(dimZ);
Allocation inputAllocation = Allocation.createTyped(RS, typeBuilder.create());
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
inputAllocation.copy3DRangeFrom(0, 0, 0, dimX, dimY, dimZ, inputArray);
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
final Int3 rsRslt = s.reduce_fz3(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
- final int cellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z];
- final boolean success = (cellVal == 0);
+ final int rsCellVal = inputArray[rsRslt.x + dimX * rsRslt.y + dimX * dimY * rsRslt.z];
+ final boolean success = (rsCellVal == 0);
+ String status = (success ? "PASSED" : "FAILED");
+ if (success)
+ status += " " + timing.string(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation);
Log.i(TAG,
- "fz3: input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + cellVal + ": " +
- (success ? "PASSED" : "FAILED"));
+ "fz3: java input[" + javaRslt.x + ", " + javaRslt.y + ", " + javaRslt.z + "] == " + javaCellVal +
+ ", rs input[" + rsRslt.x + ", " + rsRslt.y + ", " + rsRslt.z + "] == " + rsCellVal + ": " + status);
return success;
}
@@ -271,24 +509,43 @@
return outputArray;
}
- private boolean histogram(RenderScript RS, ScriptC_reduce s) {
- final byte[] inputArray = createInputArrayByte(100000, 11);
+ private boolean histogram_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final byte[] inputArray = createInputArrayByte(size[0], seed);
final long[] javaRslt = histogram(RS, inputArray);
_RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount);
final long[] rsRslt = s.reduce_histogram(inputArray).get();
_RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount);
- for (int i = 0; i < histogramBucketCount; ++i) {
- if (javaRslt[i] != rsRslt[i]) {
- Log.i(TAG,
- "histogram[" + i + "]: java " + javaRslt[i] + ", rs " + rsRslt[i] + ": FAILED");
- return false;
- }
- }
+ return result("histogram_array", new timing(size[0]), javaRslt, rsRslt);
+ }
- Log.i(TAG, "histogram: PASSED");
- return true;
+ private boolean histogram(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final byte[] inputArray = createInputArrayByte(size[0], seed);
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final long[] javaRslt = histogram(RS, inputArray);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+ _RS_ASSERT("javaRslt unexpected length: " + javaRslt.length, javaRslt.length == histogramBucketCount);
+
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocation = Allocation.createSized(RS, Element.U8(RS), inputArray.length);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocation.copyFrom(inputArray);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final long[] rsRslt = s.reduce_histogram(inputAllocation).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+ _RS_ASSERT("rsRslt unexpected length: " + rsRslt.length, rsRslt.length == histogramBucketCount);
+
+ // NOTE: The "java time" is actually for the RenderScript histogram intrinsic
+ return result("histogram",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart,
+ copyTimeStart, kernelTimeStart, rsTimeEnd, inputAllocation),
+ javaRslt, rsRslt);
}
//-----------------------------------------------------------------
@@ -302,17 +559,250 @@
return new Int2(modeIdx, (int)hsg[modeIdx]);
}
- private boolean mode(RenderScript RS, ScriptC_reduce s) {
- final byte[] inputArray = createInputArrayByte(100000, 12);
+ // Array-based mode test: compares the Java reference mode(RS, inputArray)
+ // with reduce_mode over the same byte[] input of length size[0]. Logged
+ // under its own name, like the other *_array variants.
+ private boolean mode_array(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final byte[] inputArray = createInputArrayByte(size[0], seed);
 final Int2 javaRslt = mode(RS, inputArray);
 final Int2 rsRslt = s.reduce_mode(inputArray).get();
- return result("mode", javaRslt, rsRslt);
+ return result("mode_array", new timing(size[0]), javaRslt, rsRslt);
 }
///////////////////////////////////////////////////////////////////
+ // Java-side reference for the sumgcd reduction: sums, over all indices,
+ // the value left in the first operand after iterating
+ // (x, y) -> (y, x % y) until y is zero (Euclid's algorithm).
+ private long sumgcd(final int in1[], final int in2[]) {
+     _RS_ASSERT("sumgcd input length mismatch", in1.length == in2.length);
+
+     long total = 0;
+     for (int idx = 0; idx < in1.length; ++idx) {
+         int x = in1[idx];
+         int y = in2[idx];
+         while (y != 0) {
+             final int rem = x % y;
+             x = y;
+             y = rem;
+         }
+         total += x;
+     }
+     return total;
+ }
+
+ // Allocation-based sumgcd test: compares the Java reference sum with
+ // reduce_sumgcd over two inputs of length size[0], taking wall-clock
+ // timestamps around each phase for the timing report.
+ private boolean sumgcd(RenderScript RS, ScriptC_reduce s, int seed, int size[]) {
+ final int len = size[0];
+
+ // One seed per input block: seed+0 and seed+1.
+ final int[] inputArrayA = createInputArrayInt(len, seed+0);
+ final int[] inputArrayB = createInputArrayInt(len, seed+1);
+
+ final long javaTimeStart = java.lang.System.currentTimeMillis();
+ final long javaRslt = sumgcd(inputArrayA, inputArrayB);
+ final long javaTimeEnd = java.lang.System.currentTimeMillis();
+
+ // The RenderScript timestamps bracket allocation creation, the copy
+ // into the allocations, and the reduction call itself.
+ final long rsTimeStart = java.lang.System.currentTimeMillis();
+
+ Allocation inputAllocationA = Allocation.createSized(RS, Element.I32(RS), len);
+ Allocation inputAllocationB = Allocation.createSized(RS, Element.I32(RS), len);
+
+ final long copyTimeStart = java.lang.System.currentTimeMillis();
+
+ inputAllocationA.copyFrom(inputArrayA);
+ inputAllocationB.copyFrom(inputArrayB);
+
+ final long kernelTimeStart = java.lang.System.currentTimeMillis();
+ final long rsRslt = s.reduce_sumgcd(inputAllocationA, inputAllocationB).get();
+ final long rsTimeEnd = java.lang.System.currentTimeMillis();
+
+ return result("sumgcd",
+ new timing(javaTimeStart, javaTimeEnd, rsTimeStart, copyTimeStart, kernelTimeStart, rsTimeEnd,
+ inputAllocationA, inputAllocationB),
+ javaRslt, rsRslt);
+ }
+
+ ///////////////////////////////////////////////////////////////////
+
+ public static final int maxSeedsPerTest = 10;
+
+ // Single-method interface implemented by each test method; instances are
+ // supplied as method references (e.g. this::addint1D) in the test tables.
+ static interface Test {
+ // A test execution is characterized by two properties: A seed
+ // and a size.
+ //
+ // The seed is used for generating pseudorandom input data.
+ // Ideally, we use different seeds for different tests and for
+ // different executions of the same test at different sizes.
+ // A test with multiple blocks of input data (i.e., for a
+ // reduction with multiple inputs) may want multiple seeds; it
+ // may use the seeds seed..seed+maxSeedsPerTest-1.
+ //
+ // The size indicates the amount of input data. It is the number
+ // of cells in a particular dimension of the iteration space.
+ //
+ // Returns true if the test passed; callers AND the results together.
+ boolean run(RenderScript RS, ScriptC_reduce s, int seed, int[] size);
+ };
+
+ // Bundles what is needed to schedule one test: a display name, the
+ // method to invoke, a base seed, and the sizes to run it at.
+ static class TestDescription {
+     // Name printed when the test is run.
+     public final String testName;
+
+     // The test method itself.
+     public final Test test;
+
+     // Base seed; multiply by maxSeedsPerTest when executing the test.
+     public final int seed;
+
+     // Size to use when the test is run only once.
+     public final int[] defSize;
+
+     // Maximum size to use when the test is run over a range of sizes;
+     // null when no range was given.
+     public final int[] log2MaxSize;
+
+     // Description without a size range: log2MaxSize is left null.
+     public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize) {
+         this(myTestName, myTest, mySeed, myDefSize, null);
+     }
+
+     public TestDescription(String myTestName, Test myTest, int mySeed, int[] myDefSize, int[] myLog2MaxSize) {
+         testName = myTestName;
+         test = myTest;
+         seed = mySeed;
+         defSize = myDefSize;
+         log2MaxSize = myLog2MaxSize;
+     }
+ }
+
+ // Logs the test name, seed and size vector, then invokes the test and
+ // returns its result.
+ private boolean run(TestDescription td, RenderScript RS, ScriptC_reduce s, int seed, int[] size) {
+     final StringBuilder sizeList = new StringBuilder();
+     for (final int dim : size) {
+         // Separator goes before every element except the first.
+         if (sizeList.length() > 0)
+             sizeList.append(", ");
+         sizeList.append(dim);
+     }
+     Log.i(TAG, "Running " + td.testName + "(seed = " + seed + ", size[] = {" + sizeList + "})");
+     return td.test.run(RS, s, seed, size);
+ }
+
+ // Table of correctness tests. Each entry gives the test name, the test
+ // method, a base seed, the default size, and optionally log2 of the
+ // largest size to use when sweeping over sizes. Entry names match the
+ // names the tests log under.
+ private final TestDescription[] correctnessTests = {
+ // alloc and array variants of the same test will use the same
+ // seed, in case results need to be compared.
+
+ new TestDescription("addint1D", this::addint1D, 0, new int[]{100000}, new int[]{20}),
+ new TestDescription("addint1D_array", this::addint1D_array, 0, new int[]{100000}, new int[]{20}),
+ new TestDescription("addint2D", this::addint2D, 1, new int[]{450, 225}),
+ new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000}, new int[]{20}),
+ new TestDescription("findMinAndMax_array", this::findMinAndMax_array, 3, new int[]{100000}, new int[]{20}),
+ new TestDescription("fz", this::fz, 4, new int[]{100000}, new int[]{20}),
+ new TestDescription("fz_array", this::fz_array, 4, new int[]{100000}, new int[]{20}),
+ new TestDescription("fz2", this::fz2, 5, new int[]{225, 450}),
+ new TestDescription("fz3", this::fz3, 6, new int[]{59, 48, 37}),
+ new TestDescription("histogram", this::histogram, 7, new int[]{100000}, new int[]{20}),
+ new TestDescription("histogram_array", this::histogram_array, 7, new int[]{100000}, new int[]{20}),
+ // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}, new int[]{20}),
+ new TestDescription("mode_array", this::mode_array, 8, new int[]{100000}, new int[]{20}),
+ new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 16}, new int[]{20})
+ };
+
+ // Runs every correctness test once at its default size. Every test is
+ // executed even after a failure; returns true only if all of them passed.
+ private boolean runCorrectnessQuick(RenderScript RS, ScriptC_reduce s) {
+     boolean allPassed = true;
+     for (int idx = 0; idx < correctnessTests.length; ++idx) {
+         final TestDescription td = correctnessTests[idx];
+         final boolean ok = run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize);
+         allPassed = allPassed && ok;
+     }
+     return allPassed;
+ }
+
+ // Runs each correctness test that declares a one-element log2MaxSize over
+ // a sweep of input sizes. Tests with no log2MaxSize, or with more than
+ // one dimension, are currently skipped. Every selected execution runs
+ // even after a failure; returns true only if all of them passed.
+ private boolean runCorrectness(RenderScript RS, ScriptC_reduce s) {
+ boolean pass = true;
+
+ for (TestDescription td : correctnessTests) {
+ if (td.log2MaxSize == null) // TODO: Eventually this should never happen?
+ continue;
+
+ if (td.log2MaxSize.length == 1) {
+ final int log2MaxSize = td.log2MaxSize[0];
+ // We will execute the test with the following sizes:
+ // (a) Each power of 2 from zero (2**0) up to log2MaxSize (2**log2MaxSize)
+ // (b) Each size from (a) +/-1
+ // (c) 2 random sizes between adjacent points in (a)
+ int[] testSizes = new int[
+ /* a */ (1 + log2MaxSize) +
+ /* b */ 2*(1 + log2MaxSize) +
+ /* c */ 2*log2MaxSize];
+
+ // NOTE: Each test execution gets maxSeedsPerTest, and
+ // there are up to 3 + 5*log2MaxSize test executions
+ // of a test, and we need a seed for (c). Assuming
+ // log2MaxSize does not exceed 32, then it should be
+ // sufficient to reserve 1 + 5*32*maxSeedsPerTest seeds
+ // per TestDescription.
+ final int seedForPickingTestSizes = td.seed * (1 + 5*32*maxSeedsPerTest);
+
+ int nextTestIdx = 0;
+
+ // Fill in (a) and (b)
+ // NOTE(review): 1 << i is an int shift, so log2MaxSize must stay
+ // below 31 for every generated size to be positive.
+ for (int i = 0; i <= log2MaxSize; ++i) {
+ final int pwrOf2 = 1 << i;
+ testSizes[nextTestIdx++] = pwrOf2; /* a */
+ testSizes[nextTestIdx++] = pwrOf2 - 1; /* b */
+ testSizes[nextTestIdx++] = pwrOf2 + 1; /* b */
+ }
+
+ // Fill in (c)
+ // The open interval between 2**i and 2**(i+1) is empty for i == 0,
+ // so those two slots are left at their initial value of 0.
+ Random r = new Random(seedForPickingTestSizes);
+ for (int i = 0; i < log2MaxSize; ++i) {
+ final int lo = (1 << i) + 1;
+ final int hi = 1 << (i + 1);
+
+ if (lo < hi) {
+ for (int j = 0; j < 2; ++j) {
+ testSizes[nextTestIdx++] = r.nextInt(hi - lo) + lo;
+ }
+ }
+ }
+
+ Arrays.sort(testSizes);
+
+ // Walk the sorted sizes, skipping zero entries (unfilled slots and
+ // 2**0 - 1) and any size equal to the one just run. The one-element
+ // array doubles as the size argument and the "previous size" marker.
+ int[] lastTestSizeArg = new int[]{-1};
+ for (int i = 0; i < testSizes.length; ++i) {
+ if ((testSizes[i] > 0) && (testSizes[i] != lastTestSizeArg[0])) {
+ lastTestSizeArg[0] = testSizes[i];
+ // Seed depends on the slot index i, not on how many tests have run.
+ final int seedForTestExecution = seedForPickingTestSizes + 1 + i*maxSeedsPerTest;
+ pass &= run(td, RS, s, seedForTestExecution, lastTestSizeArg);
+ }
+ }
+ }
+ // TODO: lengths 2 and 3, and assert otherwise
+ }
+
+ return pass;
+ }
+
+ // Table of tests executed by runPerformanceQuick. Only the allocation-based
+ // variants are listed, each with a default size given as a shifted
+ // multiple of a base value; no log2MaxSize is supplied.
+ private final TestDescription[] performanceTests = {
+ new TestDescription("addint1D", this::addint1D, 0, new int[]{100000 << 10}),
+ new TestDescription("addint2D", this::addint2D, 1, new int[]{450 << 5, 225 << 5}),
+ new TestDescription("findMinAndMax", this::findMinAndMax, 3, new int[]{100000 << 9}),
+ new TestDescription("fz", this::fz, 4, new int[]{100000 << 10}),
+ new TestDescription("fz2", this::fz2, 5, new int[]{225 << 5, 450 << 5}),
+ new TestDescription("fz3", this::fz3, 6, new int[]{59 << 3, 48 << 3, 37 << 3}),
+ new TestDescription("histogram", this::histogram, 7, new int[]{100000 << 10}),
+ // might want to add: new TestDescription("mode", this::mode, 8, new int[]{100000}),
+ new TestDescription("sumgcd", this::sumgcd, 9, new int[]{1 << 21})
+ };
+
+ // Runs every performance test once at its default size. Every test is
+ // executed even after a failure; returns true only if all of them passed.
+ private boolean runPerformanceQuick(RenderScript RS, ScriptC_reduce s) {
+     boolean allPassed = true;
+     for (int idx = 0; idx < performanceTests.length; ++idx) {
+         final TestDescription td = performanceTests[idx];
+         final boolean ok = run(td, RS, s, maxSeedsPerTest * td.seed, td.defSize);
+         allPassed = allPassed && ok;
+     }
+     return allPassed;
+ }
+
+
public void run() {
RenderScript pRS = RenderScript.create(mCtx);
ScriptC_reduce s = new ScriptC_reduce(pRS);
@@ -320,15 +810,10 @@
s.set_posInf(Float.POSITIVE_INFINITY);
boolean pass = true;
- pass &= addint1D(pRS, s);
- pass &= addint2D(pRS, s);
- pass &= dp(pRS, s);
- pass &= findMinAndMax(pRS, s);
- pass &= fz(pRS, s);
- pass &= fz2(pRS, s);
- pass &= fz3(pRS, s);
- pass &= histogram(pRS, s);
- pass &= mode(pRS, s);
+
+ pass &= runCorrectnessQuick(pRS, s);
+ pass &= runCorrectness(pRS, s);
+ // pass &= runPerformanceQuick(pRS, s);
pRS.finish();
pRS.destroy();
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
index 3a64a73..6a50d2b 100644
--- a/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_reduce_backward.java
@@ -119,39 +119,6 @@
///////////////////////////////////////////////////////////////////
- private float dp(float[] input1, float[] input2) {
- _RS_ASSERT("dp input length mismatch", input1.length == input2.length);
-
- float rslt = 0;
- for (int idx = 0; idx < input1.length; ++idx)
- rslt += input1[idx] * input2[idx];
- return rslt;
- }
-
- private boolean dp(RenderScript RS, ScriptC_reduce_backward s) {
- final float[] input1 = createInputArrayFloat(100000, 2);
- final float[] input2 = createInputArrayFloat(100000, 3);
-
- final float javaRslt = dp(input1, input2);
- final float rsRslt = s.reduce_dp(input1, input2).get();
-
- // NOTE: Using a floating point equality check to test for
- // correctness -- as we do below -- is a bad idea. It's only
- // reliable if the Java and RenderScript implementation of dp
- // use the same algorithm. Equality could be broken by
- // different optimizations between the two, or running the
- // RenderScript algorithm multithreaded, or running the
- // RenderScript algorithm on a GPU rather than the CPU.
- //
- // Should we be checking instead that the results are
- // "sufficiently close"? Cooking the input set to try to
- // ensure a deterministic result? Changing to integers
- // instead?
- return result("dp", javaRslt, rsRslt);
- }
-
- ///////////////////////////////////////////////////////////////////
-
private Int2 findMinAndMax(float[] input) {
float minVal = Float.POSITIVE_INFINITY;
int minIdx = -1;
@@ -322,7 +289,6 @@
boolean pass = true;
pass &= addint1D(pRS, s);
pass &= addint2D(pRS, s);
- pass &= dp(pRS, s);
pass &= findMinAndMax(pRS, s);
pass &= fz(pRS, s);
pass &= fz2(pRS, s);
diff --git a/java/tests/RsTest/src/com/android/rs/test/math_fp16.rs b/java/tests/RsTest/src/com/android/rs/test/math_fp16.rs
index 331a871..eef3a8a 100644
--- a/java/tests/RsTest/src/com/android/rs/test/math_fp16.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/math_fp16.rs
@@ -88,6 +88,12 @@
h1 = fn(h3); \
h1 = fn(h4);
+// Calls the two-argument function fn once per operand (h1, h2, h3, h4),
+// passing the same operand for both arguments and assigning each result to
+// h1. Presumably h1..h4 are the half, half2, half3 and half4 variables
+// declared earlier in this file -- confirm against the declarations.
+#define TEST_H_FUNC_HN_HN(fn) \
+ h1 = fn(h1, h1); \
+ h1 = fn(h2, h2); \
+ h1 = fn(h3, h3); \
+ h1 = fn(h4, h4);
+
static bool testAPI() {
TEST_HN_FUNC_HN(acos);
TEST_HN_FUNC_HN(acosh);
@@ -138,7 +144,6 @@
TEST_IN_FUNC_HN(ilogb);
TEST_HN_FUNC_HN_IN(ldexp);
TEST_HN_FUNC_HN_I(ldexp);
- TEST_H_FUNC_HN(length);
TEST_HN_FUNC_HN(lgamma);
TEST_HN_FUNC_HN_PIN(lgamma);
@@ -191,7 +196,6 @@
TEST_HN_FUNC_HN(native_log1p);
TEST_HN_FUNC_HN(native_log2);
- TEST_HN_FUNC_HN(native_normalize);
TEST_HN_FUNC_HN_HN(native_powr);
TEST_HN_FUNC_HN(native_recip);
TEST_HN_FUNC_HN_IN(native_rootn);
@@ -207,7 +211,6 @@
TEST_HN_FUNC_HN(native_tanpi);
TEST_HN_FUNC_HN_HN(nextafter);
- TEST_HN_FUNC_HN(normalize);
TEST_HN_FUNC_HN_HN(pow);
TEST_HN_FUNC_HN_IN(pown);
TEST_HN_FUNC_HN_HN(powr);
@@ -241,6 +244,14 @@
// Vector math functions
h3 = cross(h3, h3);
h4 = cross(h4, h4);
+
+ TEST_H_FUNC_HN_HN(distance);
+ TEST_H_FUNC_HN_HN(dot);
+ TEST_H_FUNC_HN(length);
+ TEST_H_FUNC_HN_HN(native_distance);
+ TEST_H_FUNC_HN(native_length);
+ TEST_HN_FUNC_HN(native_normalize);
+ TEST_HN_FUNC_HN(normalize);
return true;
}
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce.rs b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
index be09dfb..97b45e0 100644
--- a/java/tests/RsTest/src/com/android/rs/test/reduce.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce.rs
@@ -16,18 +16,6 @@
/////////////////////////////////////////////////////////////////////////
-#pragma rs reduce(dp) \
- accumulator(dpAccum) combiner(dpSum)
-
-static void dpAccum(float *accum, float in1, float in2) {
- *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-/////////////////////////////////////////////////////////////////////////
-
#pragma rs reduce(findMinAndMax) \
initializer(fMMInit) accumulator(fMMAccumulator) \
combiner(fMMCombiner) outconverter(fMMOutConverter)
@@ -61,8 +49,10 @@
static void fMMCombiner(MinAndMax *accum,
const MinAndMax *val) {
- fMMAccumulator(accum, val->min.val, val->min.idx);
- fMMAccumulator(accum, val->max.val, val->max.idx);
+ if (val->min.val < accum->min.val)
+ accum->min = val->min;
+ if (val->max.val > accum->max.val)
+ accum->max = val->max;
}
static void fMMOutConverter(int2 *result,
@@ -160,3 +150,24 @@
result->x = mode;
result->y = (*h)[mode];
}
+
+/////////////////////////////////////////////////////////////////////////
+
+#pragma rs reduce(sumgcd) accumulator(sgAccum) combiner(sgCombine)
+
+// Euclid's algorithm: replaces (a, b) with (b, a % b) until b is zero,
+// then returns a.
+static int gcd(int a, int b) {
+  for (; b != 0;) {
+    const int rem = a % b;
+    a = b;
+    b = rem;
+  }
+  return a;
+}
+
+// Accumulator function named in the reduce(sumgcd) pragma: adds gcd(a, b)
+// for one pair of input values to the running sum in *accum.
+static void sgAccum(long *accum, int a, int b) {
+ *accum += gcd(a, b);
+}
+
+// Combiner function named in the reduce(sumgcd) pragma: adds the sum in
+// *other into *accum.
+static void sgCombine(long *accum, const long *other) { *accum += *other; }
diff --git a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
index 419e709..41252c8 100644
--- a/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
+++ b/java/tests/RsTest/src/com/android/rs/test/reduce_backward.rs
@@ -15,18 +15,6 @@
/////////////////////////////////////////////////////////////////////////
-static void dpAccum(float *accum, float in1, float in2) {
- *accum += in1*in2;
-}
-
-// combiner function
-static void dpSum(float *accum, const float *val) { *accum += *val; }
-
-#pragma rs reduce(dp) \
- accumulator(dpAccum) combiner(dpSum)
-
-/////////////////////////////////////////////////////////////////////////
-
typedef struct {
float val;
int idx;
@@ -56,8 +44,10 @@
static void fMMCombiner(MinAndMax *accum,
const MinAndMax *val) {
- fMMAccumulator(accum, val->min.val, val->min.idx);
- fMMAccumulator(accum, val->max.val, val->max.idx);
+ if (val->min.val < accum->min.val)
+ accum->min = val->min;
+ if (val->max.val > accum->max.val)
+ accum->max = val->max;
}
static void fMMOutConverter(int2 *result,
diff --git a/rsContext.cpp b/rsContext.cpp
index 737d636..eab8bae 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -259,6 +259,8 @@
rsc->props.mLogShadersAttr = getProp("debug.rs.shader.attributes") != 0;
rsc->props.mLogShadersUniforms = getProp("debug.rs.shader.uniforms") != 0;
rsc->props.mLogVisual = getProp("debug.rs.visual") != 0;
+ rsc->props.mLogReduceAccum = getProp("debug.rs.reduce-accum") != 0;
+ rsc->props.mDebugReduceSplitAccum = getProp("debug.rs.reduce-split-accum") != 0;
rsc->props.mDebugMaxThreads = getProp("debug.rs.max-threads");
if (getProp("debug.rs.debug") != 0) {
diff --git a/rsContext.h b/rsContext.h
index 890459d..e809792 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -227,6 +227,8 @@
bool mLogShadersAttr;
bool mLogShadersUniforms;
bool mLogVisual;
+ bool mLogReduceAccum;
+ bool mDebugReduceSplitAccum;
uint32_t mDebugMaxThreads;
} props;
diff --git a/scriptc/rs_convert.rsh b/scriptc/rs_convert.rsh
index 9ffc183..146e192 100644
--- a/scriptc/rs_convert.rsh
+++ b/scriptc/rs_convert.rsh
@@ -1247,6 +1247,21 @@
#endif
#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half2 __attribute__((const, overloadable))
+ convert_half2(half2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half3 __attribute__((const, overloadable))
+ convert_half3(half3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half4 __attribute__((const, overloadable))
+ convert_half4(half4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
extern float2 __attribute__((const, overloadable))
convert_float2(half2 v);
#endif
diff --git a/scriptc/rs_object_types.rsh b/scriptc/rs_object_types.rsh
index 671873e..e6511a5 100644
--- a/scriptc/rs_object_types.rsh
+++ b/scriptc/rs_object_types.rsh
@@ -114,7 +114,7 @@
#if (defined(RS_VERSION) && (RS_VERSION >= 14))
typedef enum {
RS_ALLOCATION_USAGE_SCRIPT = 0x0001, // Allocation is bound to and accessed by scripts.
- RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE = 0x0002, // Deprecated.
+ RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE = 0x0002, // Allocation is used as a texture source.
RS_ALLOCATION_USAGE_GRAPHICS_VERTEX = 0x0004, // Deprecated.
RS_ALLOCATION_USAGE_GRAPHICS_CONSTANTS = 0x0008, // Deprecated.
RS_ALLOCATION_USAGE_GRAPHICS_RENDER_TARGET = 0x0010, // Deprecated.
diff --git a/scriptc/rs_vector_math.rsh b/scriptc/rs_vector_math.rsh
index d611464..2f5e8e7 100644
--- a/scriptc/rs_vector_math.rsh
+++ b/scriptc/rs_vector_math.rsh
@@ -294,6 +294,26 @@
native_distance(float4 left_vector, float4 right_vector);
#endif
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+ native_distance(half left_vector, half right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+ native_distance(half2 left_vector, half2 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+ native_distance(half3 left_vector, half3 right_vector);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 24))
+extern half __attribute__((const, overloadable))
+ native_distance(half4 left_vector, half4 right_vector);
+#endif
+
/*
* native_length: Approximate length of a vector
*