Merge "Convert Android.mk to Android.bp" am: 301483980c am: 75436afd61 am: 84ed246fbe am: e50017c8ff am: 2a3df51b4e am: 13a2469386
Original change: https://android-review.googlesource.com/c/platform/frameworks/rs/+/1690365
Change-Id: I45bc066bf46b2001b17da2c70775ddee9054c545
diff --git a/Android.bp b/Android.bp
index ab53382..a2eaf6f 100644
--- a/Android.bp
+++ b/Android.bp
@@ -370,4 +370,5 @@
"cpu_ref",
"script_api",
"support",
+ "toolkit",
]
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index ce30092..d102488 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -109,6 +109,15 @@
namespace android {
namespace renderscript {
+// Convert vector to uchar4, clipping each value to 255.
+template <typename TI>
+static inline uchar4 convertClipped(TI amount) {
+ return uchar4 { static_cast<uchar>(amount.x > 255 ? 255 : amount.x),
+ static_cast<uchar>(amount.y > 255 ? 255 : amount.y),
+ static_cast<uchar>(amount.z > 255 ? 255 : amount.z),
+ static_cast<uchar>(amount.w > 255 ? 255 : amount.w)};
+}
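+// For example, convertClipped(ushort4{300, 72, 255, 512}) yields uchar4{255, 72, 255, 255};
+// components at or below 255 pass through unchanged.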
+
void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelDriverInfo *info,
uint32_t xstart, uint32_t xend,
uint32_t outstep) {
@@ -120,8 +129,11 @@
#if defined(ARCH_ARM_USE_INTRINSICS)
if (gArchUseSIMD) {
- if (rsdIntrinsicBlend_K(out, in, info->slot, x1, x2) >= 0)
+ if (rsdIntrinsicBlend_K(out, in, info->slot, 0, x2 - x1) >= 0) {
return;
+ } else {
+ ALOGW("Intrinsic Blend failed to use SIMD for %d", info->slot);
+ }
}
#endif
switch (info->slot) {
@@ -151,10 +163,10 @@
}
#endif
for (;x1 < x2; x1++, out++, in++) {
- short4 in_s = convert_short4(*in);
- short4 out_s = convert_short4(*out);
- in_s = in_s + ((out_s * (short4)(255 - in_s.w)) >> (short4)8);
- *out = convert_uchar4(in_s);
+ ushort4 in_s = convert_ushort4(*in);
+ ushort4 out_s = convert_ushort4(*out);
+ in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8);
+ *out = convertClipped(in_s);
}
break;
case BLEND_DST_OVER:
@@ -170,10 +182,10 @@
}
#endif
for (;x1 < x2; x1++, out++, in++) {
- short4 in_s = convert_short4(*in);
- short4 out_s = convert_short4(*out);
- in_s = out_s + ((in_s * (short4)(255 - out_s.w)) >> (short4)8);
- *out = convert_uchar4(in_s);
+ ushort4 in_s = convert_ushort4(*in);
+ ushort4 out_s = convert_ushort4(*out);
+ in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8);
+ *out = convertClipped(in_s);
}
break;
case BLEND_SRC_IN:
@@ -189,8 +201,8 @@
}
#endif
for (;x1 < x2; x1++, out++, in++) {
- short4 in_s = convert_short4(*in);
- in_s = (in_s * out->w) >> (short4)8;
+ ushort4 in_s = convert_ushort4(*in);
+ in_s = (in_s * out->w) >> (ushort4)8;
*out = convert_uchar4(in_s);
}
break;
@@ -261,11 +273,14 @@
}
#endif
for (;x1 < x2; x1++, out++, in++) {
- short4 in_s = convert_short4(*in);
- short4 out_s = convert_short4(*out);
+ // The max value the operation could produce before the shift
+ // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02.
+ // That value does not fit in a ushort, so we use uint.
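+ // (A ushort holds at most 65535, so the intermediate sum would wrap before the shift.)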
+ uint4 in_s = convert_uint4(*in);
+ uint4 out_s = convert_uint4(*out);
out_s.xyz = ((in_s.xyz * out_s.w) +
- (out_s.xyz * ((short3)255 - (short3)in_s.w))) >> (short3)8;
- *out = convert_uchar4(out_s);
+ (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8;
+ *out = convertClipped(out_s);
}
break;
case BLEND_DST_ATOP:
@@ -281,12 +296,12 @@
}
#endif
for (;x1 < x2; x1++, out++, in++) {
- short4 in_s = convert_short4(*in);
- short4 out_s = convert_short4(*out);
+ uint4 in_s = convert_uint4(*in);
+ uint4 out_s = convert_uint4(*out);
out_s.xyz = ((out_s.xyz * in_s.w) +
- (in_s.xyz * ((short3)255 - (short3)out_s.w))) >> (short3)8;
+ (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8;
out_s.w = in_s.w;
- *out = convert_uchar4(out_s);
+ *out = convertClipped(out_s);
}
break;
case BLEND_XOR:
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 8a3dd1a..8afa2ed 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -353,7 +353,7 @@
const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
- uchar4 *out = ((uchar4 *)info->outPtr[0]) + xstart;
+ uchar4 *out = ((uchar4 *)info->outPtr[0]);
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -430,7 +430,7 @@
const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
- uchar2 *out = ((uchar2 *)info->outPtr[0]) + xstart;
+ uchar2 *out = ((uchar2 *)info->outPtr[0]);
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -508,7 +508,7 @@
const uchar *yp2 = pin + stride * ys2;
const uchar *yp3 = pin + stride * ys3;
- uchar *out = ((uchar *)info->outPtr[0]) + xstart;
+ uchar *out = ((uchar *)info->outPtr[0]);
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -586,7 +586,7 @@
const float4 *yp2 = (const float4 *)(pin + stride * ys2);
const float4 *yp3 = (const float4 *)(pin + stride * ys3);
- float4 *out = ((float4 *)info->outPtr[0]) + xstart;
+ float4 *out = ((float4 *)info->outPtr[0]);
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -638,7 +638,7 @@
const float2 *yp2 = (const float2 *)(pin + stride * ys2);
const float2 *yp3 = (const float2 *)(pin + stride * ys3);
- float2 *out = ((float2 *)info->outPtr[0]) + xstart;
+ float2 *out = ((float2 *)info->outPtr[0]);
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -690,7 +690,7 @@
const float *yp2 = (const float *)(pin + stride * ys2);
const float *yp3 = (const float *)(pin + stride * ys3);
- float *out = ((float *)info->outPtr[0]) + xstart;
+ float *out = ((float *)info->outPtr[0]);
uint32_t x1 = xstart;
uint32_t x2 = xend;
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
index b4a2b7c..1473336 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
@@ -593,7 +593,7 @@
ENTRY(rsdIntrinsicBlend_K)
adrp x5, blendtable
add x5, x5, :lo12:blendtable
- cmp w2, tablesize >> 1
+ cmp w2, tablesize
bhs 1f
ldrsh x6, [x5, w2, uxtw #1]
add x0, x0, w3, uxtw #2
@@ -615,4 +615,3 @@
#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
BLEND_LIST(BLEND_X)
#undef BLEND_X
-
diff --git a/toolkit/Android.bp b/toolkit/Android.bp
new file mode 100644
index 0000000..d3fa21e
--- /dev/null
+++ b/toolkit/Android.bp
@@ -0,0 +1,135 @@
+package {
+ default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+// TODO: In later CLs, this build file will be replaced by a stand-alone build that's not part of Android.
+
+cc_binary {
+ name: "renderscripttoolkittest",
+ srcs: [
+ "TestTaskProcessor.cpp"
+ ],
+ shared_libs: [
+ "libbase",
+ "librenderscripttoolkit",
+ ],
+}
+
+cc_library_shared {
+ name: "librenderscripttoolkit",
+ defaults: [],
+ vendor_available: false,
+ native_bridge_supported: false,
+ vndk: {
+ enabled: false,
+ support_system_process: false,
+ },
+
+ srcs: [
+ "Blend.cpp",
+ "Blur.cpp",
+ "ColorMatrix.cpp",
+ "Convolve3x3.cpp",
+ "Convolve5x5.cpp",
+ "Histogram.cpp",
+ "Lut.cpp",
+ "Lut3d.cpp",
+ "RenderScriptToolkit.cpp",
+ "Resize.cpp",
+ "TaskProcessor.cpp",
+ "Utils.cpp",
+ "YuvToRgb.cpp",
+ ],
+
+ static_libs: [ "cpufeatures" ],
+
+ arch: {
+ arm64: {
+ cflags: [
+ "-DARCH_ARM_USE_INTRINSICS",
+ "-DARCH_ARM64_USE_INTRINSICS",
+ "-DARCH_ARM64_HAVE_NEON",
+ ],
+
+ srcs: [
+ "Blend_advsimd.S",
+ "Blur_advsimd.S",
+ "ColorMatrix_advsimd.S",
+ "Convolve_advsimd.S",
+ "Lut3d_advsimd.S",
+ "Resize_advsimd.S",
+ "YuvToRgb_advsimd.S",
+ ],
+ },
+
+ arm: {
+ cflags: [
+ "-DARCH_ARM_HAVE_VFP",
+ "-DARCH_ARM_USE_INTRINSICS",
+ ],
+
+ srcs: [
+ "Blend_neon.S",
+ "Blur_neon.S",
+ "ColorMatrix_neon.S",
+ "Convolve_neon.S",
+ "Lut3d_neon.S",
+ "Resize_neon.S",
+ "YuvToRgb_neon.S",
+ ],
+
+ asflags: ["-mfpu=neon"],
+
+ neon: {
+ cflags: [
+ "-DARCH_ARM_HAVE_NEON",
+ ],
+ },
+ },
+
+ x86: {
+ cflags: ["-DARCH_X86_HAVE_SSSE3"],
+ srcs: ["x86.cpp"],
+ },
+ x86_64: {
+ cflags: ["-DARCH_X86_HAVE_SSSE3"],
+ srcs: ["x86.cpp"],
+ avx2: {
+ cflags: ["-DARCH_X86_HAVE_AVX2", "-mavx2", "-mfma"],
+ },
+ },
+ },
+
+ shared_libs: [
+ "libbase",
+ "liblog",
+ "libnativehelper",
+ "libjnigraphics",
+ ],
+ header_libs: [
+ // TODO Once we compile in the .cpp files, check if any of these libraries are needed.
+ //"libutils_headers",
+ //"libhardware_headers",
+ ],
+
+ include_dirs: [
+ ],
+
+ cflags: [
+ "-Wthread-safety",
+ "-Werror",
+ "-Wall",
+ "-Wextra",
+ "-Wno-unused-parameter",
+ "-Wno-unused-variable",
+ ],
+
+ // TODO: Is this needed?
+ product_variables: {
+ pdk: {
+ // Not building RenderScript modules in PDK builds, as libmediandk
+ // is not available in PDK.
+ enabled: false,
+ },
+ },
+}
diff --git a/toolkit/Blend.cpp b/toolkit/Blend.cpp
new file mode 100644
index 0000000..1f6319e
--- /dev/null
+++ b/toolkit/Blend.cpp
@@ -0,0 +1,370 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Blend"
+
+/**
+ * Blends a source into a destination, based on the mode.
+ */
+class BlendTask : public Task {
+ // The type of blending to do.
+ RenderScriptToolkit::BlendingMode mMode;
+ // The input we're blending.
+ const uchar4* mIn;
+ // The destination, used both for input and output.
+ uchar4* mOut;
+
+ void blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
+ uint32_t length);
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ public:
+ BlendTask(RenderScriptToolkit::BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
+ size_t sizeY, const Restriction* restriction)
+ : Task{sizeX, sizeY, 4, true, restriction},
+ mMode{mode},
+ mIn{reinterpret_cast<const uchar4*>(in)},
+ mOut{reinterpret_cast<uchar4*>(out)} {}
+};
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+extern "C" int rsdIntrinsicBlend_K(uchar4 *out, uchar4 const *in, int slot,
+ uint32_t xstart, uint32_t xend);
+#endif
+
+#if defined(ARCH_X86_HAVE_SSSE3)
+extern void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
+#endif
+
+// Convert vector to uchar4, clipping each value to 255.
+template <typename TI>
+static inline uchar4 convertClipped(TI amount) {
+ return uchar4 { static_cast<uchar>(amount.x > 255 ? 255 : amount.x),
+ static_cast<uchar>(amount.y > 255 ? 255 : amount.y),
+ static_cast<uchar>(amount.z > 255 ? 255 : amount.z),
+ static_cast<uchar>(amount.w > 255 ? 255 : amount.w)};
+}
+
+void BlendTask::blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
+ uint32_t length) {
+ uint32_t x1 = 0;
+ uint32_t x2 = length;
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+ if (mUsesSimd) {
+ if (rsdIntrinsicBlend_K(out, in, (int) mode, x1, x2) >= 0) {
+ return;
+ } else {
+ ALOGW("Intrinsic Blend failed to use SIMD for %d", mode);
+ }
+ }
+#endif
+ switch (mode) {
+ case RenderScriptToolkit::BlendingMode::CLEAR:
+ for (;x1 < x2; x1++, out++) {
+ *out = 0;
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::SRC:
+ for (;x1 < x2; x1++, out++, in++) {
+ *out = *in;
+ }
+ break;
+ // RenderScriptToolkit::BlendingMode::DST is a NOP
+ case RenderScriptToolkit::BlendingMode::DST:
+ break;
+ case RenderScriptToolkit::BlendingMode::SRC_OVER:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSrcOver_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ ushort4 in_s = convert<ushort4>(*in);
+ ushort4 out_s = convert<ushort4>(*out);
+ in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8);
+ *out = convertClipped(in_s);
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::DST_OVER:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendDstOver_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ ushort4 in_s = convert<ushort4>(*in);
+ ushort4 out_s = convert<ushort4>(*out);
+ in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8);
+ *out = convertClipped(in_s);
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::SRC_IN:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSrcIn_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+#endif
+ for (;x1 < x2; x1++, out++, in++) {
+ ushort4 in_s = convert<ushort4>(*in);
+ in_s = (in_s * out->w) >> (ushort4)8;
+ *out = convert<uchar4>(in_s);
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::DST_IN:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendDstIn_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ ushort4 out_s = convert<ushort4>(*out);
+ out_s = (out_s * in->w) >> (ushort4)8;
+ *out = convert<uchar4>(out_s);
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::SRC_OUT:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSrcOut_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ ushort4 in_s = convert<ushort4>(*in);
+ in_s = (in_s * (ushort4)(255 - out->w)) >> (ushort4)8;
+ *out = convert<uchar4>(in_s);
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::DST_OUT:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendDstOut_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ ushort4 out_s = convert<ushort4>(*out);
+ out_s = (out_s * (ushort4)(255 - in->w)) >> (ushort4)8;
+ *out = convert<uchar4>(out_s);
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::SRC_ATOP:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSrcAtop_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ // The max value the operation could produce before the shift
+ // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02.
+ // That value does not fit in a ushort, so we use uint.
+ uint4 in_s = convert<uint4>(*in);
+ uint4 out_s = convert<uint4>(*out);
+ out_s.xyz = ((in_s.xyz * out_s.w) +
+ (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8;
+ *out = convertClipped(out_s);
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::DST_ATOP:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendDstAtop_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ uint4 in_s = convert<uint4>(*in);
+ uint4 out_s = convert<uint4>(*out);
+ out_s.xyz = ((out_s.xyz * in_s.w) +
+ (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8;
+ out_s.w = in_s.w;
+ *out = convertClipped(out_s);
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::XOR:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendXor_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ *out = *in ^ *out;
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::MULTIPLY:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendMultiply_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ *out = convert<uchar4>((convert<ushort4>(*in) * convert<ushort4>(*out))
+ >> (ushort4)8);
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::ADD:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendAdd_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ uint32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
+ oR = out->x, oG = out->y, oB = out->z, oA = out->w;
+ out->x = (oR + iR) > 255 ? 255 : oR + iR;
+ out->y = (oG + iG) > 255 ? 255 : oG + iG;
+ out->z = (oB + iB) > 255 ? 255 : oB + iB;
+ out->w = (oA + iA) > 255 ? 255 : oA + iA;
+ }
+ break;
+ case RenderScriptToolkit::BlendingMode::SUBTRACT:
+ #if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + 8) < x2) {
+ uint32_t len = (x2 - x1) >> 3;
+ rsdIntrinsicBlendSub_K(out, in, len);
+ x1 += len << 3;
+ out += len << 3;
+ in += len << 3;
+ }
+ }
+ #endif
+ for (;x1 < x2; x1++, out++, in++) {
+ int32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
+ oR = out->x, oG = out->y, oB = out->z, oA = out->w;
+ out->x = (oR - iR) < 0 ? 0 : oR - iR;
+ out->y = (oG - iG) < 0 ? 0 : oG - iG;
+ out->z = (oB - iB) < 0 ? 0 : oB - iB;
+ out->w = (oA - iA) < 0 ? 0 : oA - iA;
+ }
+ break;
+
+ default:
+ ALOGE("Called unimplemented value %d", mode);
+ assert(false);
+ }
+}
+
+void BlendTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ for (size_t y = startY; y < endY; y++) {
+ size_t offset = y * mSizeX + startX;
+ blend(mMode, mIn + offset, mOut + offset, endX - startX);
+ }
+}
+
+void RenderScriptToolkit::blend(BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
+ size_t sizeY, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+ return;
+ }
+#endif
+
+ BlendTask task(mode, in, out, sizeX, sizeY, restriction);
+ processor->doTask(&task);
+}
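+
+// Usage sketch (illustrative; assumes a constructed RenderScriptToolkit named "toolkit" and two
+// same-sized RGBA buffers src and dst, where dst is the destination that gets overwritten):
+// toolkit.blend(RenderScriptToolkit::BlendingMode::SRC_OVER, src.data(), dst.data(),
+// sizeX, sizeY, nullptr);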
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/Blend_advsimd.S b/toolkit/Blend_advsimd.S
new file mode 100644
index 0000000..e5cb29b
--- /dev/null
+++ b/toolkit/Blend_advsimd.S
@@ -0,0 +1,622 @@
+/*
+ * Copyright (C) 2013-2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+#define BLEND_LIST(X) \
+ X(0, CLEAR) \
+ X(1, SRC) \
+ X(2, DST) \
+ X(3, SRC_OVER) \
+ X(4, DST_OVER) \
+ X(5, SRC_IN) \
+ X(6, DST_IN) \
+ X(7, SRC_OUT) \
+ X(8, DST_OUT) \
+ X(9, SRC_ATOP) \
+ X(10, DST_ATOP) \
+ X(11, XOR) \
+ X(12, MULTIPLY) \
+ X(13, ADD) \
+ X(14, SUBTRACT)
+
+/* This operation was not enabled in the original RenderScript. We could
+ * enable it.
+ *
+ * X(15, DIFFERENCE) \
+ */
+
+/* For every blend operation supported, define a macro with just the arithmetic
+ * component. The rest can be handled later on.
+ *
+ * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
+ * contain the data from the source buffer. Both have already been split out
+ * into one colour component per register (if necessary). q3 and q11 contain
+ * the alpha components.
+ *
+ * At the same time as defining the assembly macro, define a corresponding
+ * preprocessor macro indicating any other requirements.
+ * zipped=0 -- The macro does not require the RGBA components to be
+ * separated.
+ * lddst=0 -- The macro does not require data from the destination buffer.
+ * ldsrc=0 -- The macro does not require data from the source buffer.
+ * nowrap=1 -- The macro requires no wrapper at all, and should simply be
+ * inserted without any surrounding load/store or loop code.
+ */
+
+#define params_CLEAR zipped=0, lddst=0, ldsrc=0
+.macro blend_kernel_CLEAR
+ movi v0.16b, #0
+ movi v1.16b, #0
+ movi v2.16b, #0
+ movi v3.16b, #0
+.endm
+
+#define params_SRC zipped=0, lddst=0
+.macro blend_kernel_SRC
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+.endm
+
+#define params_DST nowrap=1
+.macro blend_kernel_DST
+ /* nop */
+.endm
+
+#define params_SRC_OVER zipped=1
+.macro blend_kernel_SRC_OVER
+ mvn v7.16b, v11.16b
+
+ umull2 v12.8h, v7.16b, v0.16b
+ umull v0.8h, v7.8b, v0.8b
+ umull2 v13.8h, v7.16b, v1.16b
+ umull v1.8h, v7.8b, v1.8b
+ umull2 v14.8h, v7.16b, v2.16b
+ umull v2.8h, v7.8b, v2.8b
+ umull2 v15.8h, v7.16b, v3.16b
+ umull v3.8h, v7.8b, v3.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+
+ uqadd v0.16b, v0.16b, v8.16b
+ uqadd v1.16b, v1.16b, v9.16b
+ uqadd v2.16b, v2.16b, v10.16b
+ uqadd v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_DST_OVER zipped=1
+.macro blend_kernel_DST_OVER
+ mvn v7.16b, v3.16b
+
+ umull2 v12.8h, v7.16b, v8.16b
+ umull v8.8h, v7.8b, v8.8b
+ umull2 v13.8h, v7.16b, v9.16b
+ umull v9.8h, v7.8b, v9.8b
+ umull2 v14.8h, v7.16b, v10.16b
+ umull v10.8h, v7.8b, v10.8b
+ umull2 v15.8h, v7.16b, v11.16b
+ umull v11.8h, v7.8b, v11.8b
+
+ rshrn v4.8b, v8.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v9.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v10.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v11.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v8.8h, v8.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v9.8h, v9.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v10.8h, v10.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v11.8h, v11.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v8.8b, v8.8h, #8
+ rshrn2 v8.16b, v12.8h, #8
+ rshrn v9.8b, v9.8h, #8
+ rshrn2 v9.16b, v13.8h, #8
+ rshrn v10.8b, v10.8h, #8
+ rshrn2 v10.16b, v14.8h, #8
+ rshrn v11.8b, v11.8h, #8
+ rshrn2 v11.16b, v15.8h, #8
+
+ uqadd v0.16b, v0.16b, v8.16b
+ uqadd v1.16b, v1.16b, v9.16b
+ uqadd v2.16b, v2.16b, v10.16b
+ uqadd v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_SRC_IN zipped=1
+.macro blend_kernel_SRC_IN
+ umull2 v12.8h, v3.16b, v8.16b
+ umull v0.8h, v3.8b, v8.8b
+ umull2 v13.8h, v3.16b, v9.16b
+ umull v1.8h, v3.8b, v9.8b
+ umull2 v14.8h, v3.16b, v10.16b
+ umull v2.8h, v3.8b, v10.8b
+ umull2 v15.8h, v3.16b, v11.16b
+ umull v3.8h, v3.8b, v11.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+.endm
+
+#define params_DST_IN zipped=1
+.macro blend_kernel_DST_IN
+ umull2 v12.8h, v0.16b, v11.16b
+ umull v0.8h, v0.8b, v11.8b
+ umull2 v13.8h, v1.16b, v11.16b
+ umull v1.8h, v1.8b, v11.8b
+ umull2 v14.8h, v2.16b, v11.16b
+ umull v2.8h, v2.8b, v11.8b
+ umull2 v15.8h, v3.16b, v11.16b
+ umull v3.8h, v3.8b, v11.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+.endm
+
+#define params_SRC_OUT zipped=1
+.macro blend_kernel_SRC_OUT
+ mvn v3.16b, v3.16b
+ blend_kernel_SRC_IN
+.endm
+
+
+#define params_DST_OUT zipped=1
+.macro blend_kernel_DST_OUT
+ mvn v11.16b, v11.16b
+ blend_kernel_DST_IN
+.endm
+
+#define params_SRC_ATOP zipped=1
+.macro blend_kernel_SRC_ATOP
+ mvn v11.16b, v11.16b
+
+ umull2 v12.8h, v11.16b, v0.16b
+ umull v0.8h, v11.8b, v0.8b
+ umull2 v13.8h, v11.16b, v1.16b
+ umull v1.8h, v11.8b, v1.8b
+ umull2 v14.8h, v11.16b, v2.16b
+ umull v2.8h, v11.8b, v2.8b
+
+ umull2 v4.8h, v3.16b, v8.16b
+ umull v8.8h, v3.8b, v8.8b
+ umull2 v5.8h, v3.16b, v9.16b
+ umull v9.8h, v3.8b, v9.8b
+ umull2 v6.8h, v3.16b, v10.16b
+ umull v10.8h, v3.8b, v10.8b
+
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v14.8h, v14.8h, v6.8h
+ uqadd v2.8h, v2.8h, v10.8h
+
+ urshr v8.8h, v0.8h, #8
+ urshr v4.8h, v12.8h, #8
+ urshr v9.8h, v1.8h, #8
+ urshr v5.8h, v13.8h, #8
+ urshr v10.8h, v2.8h, #8
+ urshr v6.8h, v14.8h, #8
+
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v2.8h, v2.8h, v10.8h
+ uqadd v14.8h, v14.8h, v6.8h
+
+ uqrshrn v0.8b, v0.8h, #8
+ uqrshrn2 v0.16b, v12.8h, #8
+ uqrshrn v1.8b, v1.8h, #8
+ uqrshrn2 v1.16b, v13.8h, #8
+ uqrshrn v2.8b, v2.8h, #8
+ uqrshrn2 v2.16b, v14.8h, #8
+.endm
+
+#define params_DST_ATOP zipped=1
+.macro blend_kernel_DST_ATOP
+ mvn v3.16b, v3.16b
+
+ umull2 v12.8h, v11.16b, v0.16b
+ umull v0.8h, v11.8b, v0.8b
+ umull2 v13.8h, v11.16b, v1.16b
+ umull v1.8h, v11.8b, v1.8b
+ umull2 v14.8h, v11.16b, v2.16b
+ umull v2.8h, v11.8b, v2.8b
+
+ umull2 v4.8h, v3.16b, v8.16b
+ umull v8.8h, v3.8b, v8.8b
+ umull2 v5.8h, v3.16b, v9.16b
+ umull v9.8h, v3.8b, v9.8b
+ umull2 v6.8h, v3.16b, v10.16b
+ umull v10.8h, v3.8b, v10.8b
+
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v14.8h, v14.8h, v6.8h
+ uqadd v2.8h, v2.8h, v10.8h
+
+ urshr v8.8h, v0.8h, #8
+ urshr v4.8h, v12.8h, #8
+ urshr v9.8h, v1.8h, #8
+ urshr v5.8h, v13.8h, #8
+ urshr v10.8h, v2.8h, #8
+ urshr v6.8h, v14.8h, #8
+
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v2.8h, v2.8h, v10.8h
+ uqadd v14.8h, v14.8h, v6.8h
+
+ uqrshrn v0.8b, v0.8h, #8
+ uqrshrn2 v0.16b, v12.8h, #8
+ uqrshrn v1.8b, v1.8h, #8
+ uqrshrn2 v1.16b, v13.8h, #8
+ uqrshrn v2.8b, v2.8h, #8
+ uqrshrn2 v2.16b, v14.8h, #8
+
+ mov v3.16b, v11.16b
+.endm
+
+#define params_MULTIPLY zipped=0
+.macro blend_kernel_MULTIPLY
+ umull2 v12.8h, v0.16b, v8.16b
+ umull v0.8h, v0.8b, v8.8b
+ umull2 v13.8h, v1.16b, v9.16b
+ umull v1.8h, v1.8b, v9.8b
+ umull2 v14.8h, v2.16b, v10.16b
+ umull v2.8h, v2.8b, v10.8b
+ umull2 v15.8h, v3.16b, v11.16b
+ umull v3.8h, v3.8b, v11.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+.endm
+
+#define params_ADD zipped=0
+.macro blend_kernel_ADD
+ uqadd v0.16b, v0.16b, v8.16b
+ uqadd v1.16b, v1.16b, v9.16b
+ uqadd v2.16b, v2.16b, v10.16b
+ uqadd v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_SUBTRACT zipped=0
+.macro blend_kernel_SUBTRACT
+ uqsub v0.16b, v0.16b, v8.16b
+ uqsub v1.16b, v1.16b, v9.16b
+ uqsub v2.16b, v2.16b, v10.16b
+ uqsub v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_DIFFERENCE zipped=0
+.macro blend_kernel_DIFFERENCE
+ uabd v0.16b, v0.16b, v8.16b
+ uabd v1.16b, v1.16b, v9.16b
+ uabd v2.16b, v2.16b, v10.16b
+ uabd v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_XOR zipped=0
+.macro blend_kernel_XOR
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+.endm
+
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Various sections of assembly code are dropped or substituted for
+ * simpler operations if they're not needed.
+ */
+.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
+.if \nowrap
+ \kernel
+.else
+ sub x3, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x3]
+ subs x2, x2, #64
+ b 2f
+.align 4
+1:
+ .if \lddst
+ .if \zipped
+ ld4 {v0.16b - v3.16b}, [x0]
+ .else
+ ld1 {v0.16b - v3.16b}, [x0]
+ .endif
+ .endif
+ .if \ldsrc
+ .if \zipped
+ ld4 {v8.16b - v11.16b}, [x1], #64
+ .else
+ ld1 {v8.16b - v11.16b}, [x1], #64
+ .endif
+ .endif
+ .if \pld
+#if 0 /* TODO: test this on real hardware */
+ .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
+ .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
+#endif
+ .endif
+
+ \kernel
+
+ subs x2, x2, #64
+ .if \zipped
+ st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+ .else
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+ .endif
+
+2: bge 1b
+ adds x2, x2, #64
+ beq 2f
+
+ /* To handle the tail portion of the data (something less than 64
+ * bytes) load small power-of-two chunks into working registers. It
+ * doesn't matter where they end up in the register; the same process
+ * will store them back out using the same positions and the operations
+ * don't require data to interact with its neighbours.
+ */
+ movi v0.16b, #0
+ movi v1.16b, #0
+ movi v2.16b, #0
+ movi v3.16b, #0
+
+ movi v8.16b, #0
+ movi v9.16b, #0
+ movi v10.16b, #0
+ movi v11.16b, #0
+
+ tbz x2, #5, 1f
+ .if \lddst ; ld1 {v2.16b,v3.16b}, [x0], #32 ; .endif
+ .if \ldsrc ; ld1 {v10.16b,v11.16b}, [x1], #32 ; .endif
+1: tbz x2, #4, 1f
+ .if \lddst ; ld1 {v1.16b}, [x0], #16 ; .endif
+ .if \ldsrc ; ld1 {v9.16b}, [x1], #16 ; .endif
+1: tbz x2, #3, 1f
+ .if \lddst ; ld1 {v0.d}[1], [x0], #8 ; .endif
+ .if \ldsrc ; ld1 {v8.d}[1], [x1], #8 ; .endif
+1: tbz x2, #2, 1f
+ .if \lddst ; ld1 {v0.s}[1], [x0], #4 ; .endif
+ .if \ldsrc ; ld1 {v8.s}[1], [x1], #4 ; .endif
+1: tbz x2, #1, 1f
+ .if \lddst ; ld1 {v0.h}[1], [x0], #2 ; .endif
+ .if \ldsrc ; ld1 {v8.h}[1], [x1], #2 ; .endif
+1: tbz x2, #0, 1f
+ .if \lddst ; ld1 {v0.b}[1], [x0], #1 ; .endif
+ .if \ldsrc ; ld1 {v8.b}[1], [x1], #1 ; .endif
+1:
+ .if \lddst ; sub x0, x0, x2 ; .endif
+
+.if \zipped
+ /* One small impediment in the process above is that some of the load
+ * operations can't perform byte-wise structure deinterleaving at the
+ * same time as loading only part of a register. So the data is loaded
+ * linearly and unpacked manually at this point.
+ */
+ uzp1 v4.16b, v0.16b, v1.16b
+ uzp2 v5.16b, v0.16b, v1.16b
+ uzp1 v6.16b, v2.16b, v3.16b
+ uzp2 v7.16b, v2.16b, v3.16b
+ uzp1 v0.16b, v4.16b, v6.16b
+ uzp2 v2.16b, v4.16b, v6.16b
+ uzp1 v1.16b, v5.16b, v7.16b
+ uzp2 v3.16b, v5.16b, v7.16b
+
+ uzp1 v4.16b, v8.16b, v9.16b
+ uzp2 v5.16b, v8.16b, v9.16b
+ uzp1 v6.16b, v10.16b, v11.16b
+ uzp2 v7.16b, v10.16b, v11.16b
+ uzp1 v8.16b, v4.16b, v6.16b
+ uzp2 v10.16b, v4.16b, v6.16b
+ uzp1 v9.16b, v5.16b, v7.16b
+ uzp2 v11.16b, v5.16b, v7.16b
+
+ \kernel
+
+ zip1 v4.16b, v0.16b, v2.16b
+ zip2 v6.16b, v0.16b, v2.16b
+ zip1 v5.16b, v1.16b, v3.16b
+ zip2 v7.16b, v1.16b, v3.16b
+ zip1 v0.16b, v4.16b, v5.16b
+ zip2 v1.16b, v4.16b, v5.16b
+ zip1 v2.16b, v6.16b, v7.16b
+ zip2 v3.16b, v6.16b, v7.16b
+ .else
+ \kernel
+ .endif
+
+ tbz x2, #5, 1f
+ st1 {v2.16b,v3.16b}, [x0], #32
+1: tbz x2, #4, 1f
+ st1 {v1.16b}, [x0], #16
+1: tbz x2, #3, 1f
+ st1 {v0.d}[1], [x0], #8
+1: tbz x2, #2, 1f
+ st1 {v0.s}[1], [x0], #4
+1: tbz x2, #1, 1f
+ st1 {v0.h}[1], [x0], #2
+1: tbz x2, #0, 2f
+ st1 {v0.b}[1], [x0], #1
+2: ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+.endif
+ mov x0, #0
+ ret
+.endm
+
+
+/* produce list of blend_line_XX() functions; each function uses the wrap_line
+ * macro, passing it the name of the operation macro it wants along with
+ * optional parameters to remove unnecessary operations.
+ */
+#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+#define BLEND_X(d, n) .set tablesize, d+1 ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+/* int rsdIntrinsicBlend_K(
+ * uchar4 *out, // x0
+ * uchar4 const *in, // x1
+ * int slot, // x2
+ * size_t xstart, // x3
+ * size_t xend); // x4
+ */
+ENTRY(rsdIntrinsicBlend_K)
+ adrp x5, blendtable
+ add x5, x5, :lo12:blendtable
+ cmp w2, tablesize
+ bhs 1f
+ ldrsh x6, [x5, w2, uxtw #1]
+ add x0, x0, w3, uxtw #2
+ add x1, x1, w3, uxtw #2
+ sub w2, w4, w3
+ ubfiz x2, x2, #2, #32 /* TODO: fix */
+ cbz x6, 1f
+ adr x5, 2f
+ add x6, x5, x6
+2: br x6
+1: mov x0, #-1
+ ret
+
+END(rsdIntrinsicBlend_K)
+
+.rodata
+.set off,0
+blendtable:
+#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X
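+
+/* Each blendtable entry is the halfword offset from the "2:" dispatch branch in
+ * rsdIntrinsicBlend_K to the corresponding blend_line_* routine; any gap in the slot numbering
+ * is emitted as zero, which the cbz above turns into the -1 error return.
+ */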
diff --git a/toolkit/Blend_neon.S b/toolkit/Blend_neon.S
new file mode 100644
index 0000000..a1fa1b5
--- /dev/null
+++ b/toolkit/Blend_neon.S
@@ -0,0 +1,617 @@
+/*
+ * Copyright (C) 2013-2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+#define BLEND_LIST(X) \
+ X(0, CLEAR) \
+ X(1, SRC) \
+ X(2, DST) \
+ X(3, SRC_OVER) \
+ X(4, DST_OVER) \
+ X(5, SRC_IN) \
+ X(6, DST_IN) \
+ X(7, SRC_OUT) \
+ X(8, DST_OUT) \
+ X(9, SRC_ATOP) \
+ X(10, DST_ATOP) \
+ X(11, XOR) \
+ X(14, MULTIPLY) \
+ X(21, DIFFERENCE) \
+ X(34, ADD) \
+ X(35, SUBTRACT)
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+/* For every blend operation supported, define a macro with just the arithmetic
+ * component. The rest can be handled later on.
+ *
+ * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
+ * contain the data from the source buffer. Both have already been split out
+ * into one colour component per register (if necessary). q3 and q11 contain
+ * the alpha components.
+ *
+ * At the same time as defining the assembly macro, define a corresponding
+ * preprocessor macro indicating any other requirements.
+ * zipped=0 -- The macro does not require the RGBA components to be
+ * separated.
+ * lddst=0 -- The macro does not require data from the destination buffer.
+ * ldsrc=0 -- The macro does not require data from the source buffer.
+ * nowrap=1 -- The macro requires no wrapper at all, and should simply be
+ * inserted without any surrounding load/store or loop code.
+ */
+
+#define params_CLEAR zipped=0, lddst=0, ldsrc=0
+.macro blend_kernel_CLEAR
+ vmov.i8 q0, #0
+ vmov.i8 q1, #0
+ vmov.i8 q2, #0
+ vmov.i8 q3, #0
+.endm
+
+#define params_SRC zipped=0, lddst=0
+.macro blend_kernel_SRC
+ vmov q0, q8
+ vmov q1, q9
+ vmov q2, q10
+ vmov q3, q11
+.endm
+
+#define params_DST nowrap=1
+.macro blend_kernel_DST
+ /* nop */
+.endm
+
+#define params_SRC_OVER zipped=1
+.macro blend_kernel_SRC_OVER
+ vmvn q7, q11
+
+ vmull.u8 q12, d15, d1
+ vmull.u8 q0, d14, d0
+ vmull.u8 q13, d15, d3
+ vmull.u8 q1, d14, d2
+ vmull.u8 q14, d15, d5
+ vmull.u8 q2, d14, d4
+ vmull.u8 q15, d15, d7
+ vmull.u8 q3, d14, d6
+
+ vrshrn.u16 d8, q0, #8
+ vrshrn.u16 d9, q12, #8
+ vrshrn.u16 d10, q1, #8
+ vrshrn.u16 d11, q13, #8
+ vrshrn.u16 d12, q2, #8
+ vrshrn.u16 d13, q14, #8
+ vrshrn.u16 d14, q3, #8
+ vrshrn.u16 d15, q15, #8
+
+ vaddw.u8 q0, d8
+ vaddw.u8 q12, d9
+ vaddw.u8 q1, d10
+ vaddw.u8 q13, d11
+ vaddw.u8 q2, d12
+ vaddw.u8 q14, d13
+ vaddw.u8 q3, d14
+ vaddw.u8 q15, d15
+
+ vrshrn.u16 d0, q0, #8
+ vrshrn.u16 d1, q12, #8
+ vrshrn.u16 d2, q1, #8
+ vrshrn.u16 d3, q13, #8
+ vrshrn.u16 d4, q2, #8
+ vrshrn.u16 d5, q14, #8
+ vrshrn.u16 d6, q3, #8
+ vrshrn.u16 d7, q15, #8
+
+ vqadd.u8 q0, q8
+ vqadd.u8 q1, q9
+ vqadd.u8 q2, q10
+ vqadd.u8 q3, q11
+.endm
+
+#define params_DST_OVER zipped=1
+.macro blend_kernel_DST_OVER
+ vmvn q7, q3
+
+ vmull.u8 q12, d15, d17
+ vmull.u8 q8, d14, d16
+ vmull.u8 q13, d15, d19
+ vmull.u8 q9, d14, d18
+ vmull.u8 q14, d15, d21
+ vmull.u8 q10, d14, d20
+ vmull.u8 q15, d15, d23
+ vmull.u8 q11, d14, d22
+
+ vrshrn.u16 d8, q0, #8
+ vrshrn.u16 d9, q12, #8
+ vrshrn.u16 d10, q1, #8
+ vrshrn.u16 d11, q13, #8
+ vrshrn.u16 d12, q2, #8
+ vrshrn.u16 d13, q14, #8
+ vrshrn.u16 d14, q3, #8
+ vrshrn.u16 d15, q15, #8
+
+ vaddw.u8 q8, d8
+ vaddw.u8 q12, d9
+ vaddw.u8 q9, d10
+ vaddw.u8 q13, d11
+ vaddw.u8 q10, d12
+ vaddw.u8 q14, d13
+ vaddw.u8 q11, d14
+ vaddw.u8 q15, d15
+
+ vrshrn.u16 d16, q8, #8
+ vrshrn.u16 d17, q12, #8
+ vrshrn.u16 d18, q9, #8
+ vrshrn.u16 d19, q13, #8
+ vrshrn.u16 d20, q10, #8
+ vrshrn.u16 d21, q14, #8
+ vrshrn.u16 d22, q11, #8
+ vrshrn.u16 d23, q15, #8
+
+ vqadd.u8 q0, q8
+ vqadd.u8 q1, q9
+ vqadd.u8 q2, q10
+ vqadd.u8 q3, q11
+.endm
+
+#define params_SRC_IN zipped=1
+.macro blend_kernel_SRC_IN
+ vmull.u8 q12, d7, d17
+ vmull.u8 q0, d6, d16
+ vmull.u8 q13, d7, d19
+ vmull.u8 q1, d6, d18
+ vmull.u8 q14, d7, d21
+ vmull.u8 q2, d6, d20
+ vmull.u8 q15, d7, d23
+ vmull.u8 q3, d6, d22
+
+ vrshrn.u16 d8, q0, #8
+ vrshrn.u16 d9, q12, #8
+ vrshrn.u16 d10, q1, #8
+ vrshrn.u16 d11, q13, #8
+ vrshrn.u16 d12, q2, #8
+ vrshrn.u16 d13, q14, #8
+ vrshrn.u16 d14, q3, #8
+ vrshrn.u16 d15, q15, #8
+
+ vaddw.u8 q0, d8
+ vaddw.u8 q12, d9
+ vaddw.u8 q1, d10
+ vaddw.u8 q13, d11
+ vaddw.u8 q2, d12
+ vaddw.u8 q14, d13
+ vaddw.u8 q3, d14
+ vaddw.u8 q15, d15
+
+ vrshrn.u16 d0, q0, #8
+ vrshrn.u16 d1, q12, #8
+ vrshrn.u16 d2, q1, #8
+ vrshrn.u16 d3, q13, #8
+ vrshrn.u16 d4, q2, #8
+ vrshrn.u16 d5, q14, #8
+ vrshrn.u16 d6, q3, #8
+ vrshrn.u16 d7, q15, #8
+.endm
+
+#define params_DST_IN zipped=1
+.macro blend_kernel_DST_IN
+ vmull.u8 q12, d1, d23
+ vmull.u8 q0, d0, d22
+ vmull.u8 q13, d3, d23
+ vmull.u8 q1, d2, d22
+ vmull.u8 q14, d5, d23
+ vmull.u8 q2, d4, d22
+ vmull.u8 q15, d7, d23
+ vmull.u8 q3, d6, d22
+
+ vrshrn.u16 d8, q0, #8
+ vrshrn.u16 d9, q12, #8
+ vrshrn.u16 d10, q1, #8
+ vrshrn.u16 d11, q13, #8
+ vrshrn.u16 d12, q2, #8
+ vrshrn.u16 d13, q14, #8
+ vrshrn.u16 d14, q3, #8
+ vrshrn.u16 d15, q15, #8
+
+ vaddw.u8 q0, d8
+ vaddw.u8 q12, d9
+ vaddw.u8 q1, d10
+ vaddw.u8 q13, d11
+ vaddw.u8 q2, d12
+ vaddw.u8 q14, d13
+ vaddw.u8 q3, d14
+ vaddw.u8 q15, d15
+
+ vrshrn.u16 d0, q0, #8
+ vrshrn.u16 d1, q12, #8
+ vrshrn.u16 d2, q1, #8
+ vrshrn.u16 d3, q13, #8
+ vrshrn.u16 d4, q2, #8
+ vrshrn.u16 d5, q14, #8
+ vrshrn.u16 d6, q3, #8
+ vrshrn.u16 d7, q15, #8
+.endm
+
+#define params_SRC_OUT zipped=1
+.macro blend_kernel_SRC_OUT
+ vmvn q3, q3
+ blend_kernel_SRC_IN
+.endm
+
+
+#define params_DST_OUT zipped=1
+.macro blend_kernel_DST_OUT
+ vmvn q11, q11
+ blend_kernel_DST_IN
+.endm
+
+#define params_SRC_ATOP zipped=1
+.macro blend_kernel_SRC_ATOP
+ vmvn q11, q11
+
+ vmull.u8 q12, d23, d1
+ vmull.u8 q0, d22, d0
+ vmull.u8 q13, d23, d3
+ vmull.u8 q1, d22, d2
+ vmull.u8 q14, d23, d5
+ vmull.u8 q2, d22, d4
+
+ vmull.u8 q4, d7, d17
+ vmull.u8 q8, d6, d16
+ vmull.u8 q5, d7, d19
+ vmull.u8 q9, d6, d18
+ vmull.u8 q6, d7, d21
+ vmull.u8 q10, d6, d20
+
+ vqadd.u16 q12, q4
+ vqadd.u16 q0, q8
+ vqadd.u16 q13, q5
+ vqadd.u16 q1, q9
+ vqadd.u16 q14, q6
+ vqadd.u16 q2, q10
+
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q4, q12, #8
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q5, q13, #8
+ vrshr.u16 q10, q2, #8
+ vrshr.u16 q6, q14, #8
+
+ vqadd.u16 q0, q8
+ vqadd.u16 q12, q4
+ vqadd.u16 q1, q9
+ vqadd.u16 q13, q5
+ vqadd.u16 q2, q10
+ vqadd.u16 q14, q6
+
+ vqrshrn.u16 d0, q0, #8
+ vqrshrn.u16 d1, q12, #8
+ vqrshrn.u16 d2, q1, #8
+ vqrshrn.u16 d3, q13, #8
+ vqrshrn.u16 d4, q2, #8
+ vqrshrn.u16 d5, q14, #8
+.endm
+
+#define params_DST_ATOP zipped=1
+.macro blend_kernel_DST_ATOP
+ vmvn q3, q3
+
+ vmull.u8 q12, d23, d1
+ vmull.u8 q0, d22, d0
+ vmull.u8 q13, d23, d3
+ vmull.u8 q1, d22, d2
+ vmull.u8 q14, d23, d5
+ vmull.u8 q2, d22, d4
+
+ vmull.u8 q4, d7, d17
+ vmull.u8 q8, d6, d16
+ vmull.u8 q5, d7, d19
+ vmull.u8 q9, d6, d18
+ vmull.u8 q6, d7, d21
+ vmull.u8 q10, d6, d20
+
+ vqadd.u16 q12, q4
+ vqadd.u16 q0, q8
+ vqadd.u16 q13, q5
+ vqadd.u16 q1, q9
+ vqadd.u16 q14, q6
+ vqadd.u16 q2, q10
+
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q4, q12, #8
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q5, q13, #8
+ vrshr.u16 q10, q2, #8
+ vrshr.u16 q6, q14, #8
+
+ vqadd.u16 q0, q8
+ vqadd.u16 q12, q4
+ vqadd.u16 q1, q9
+ vqadd.u16 q13, q5
+ vqadd.u16 q2, q10
+ vqadd.u16 q14, q6
+
+ vqrshrn.u16 d0, q0, #8
+ vqrshrn.u16 d1, q12, #8
+ vqrshrn.u16 d2, q1, #8
+ vqrshrn.u16 d3, q13, #8
+ vqrshrn.u16 d4, q2, #8
+ vqrshrn.u16 d5, q14, #8
+
+ vmov q3, q11
+.endm
+
+#define params_MULTIPLY zipped=0
+.macro blend_kernel_MULTIPLY
+ vmull.u8 q12, d1, d17
+ vmull.u8 q0, d0, d16
+ vmull.u8 q13, d3, d19
+ vmull.u8 q1, d2, d18
+ vmull.u8 q14, d5, d21
+ vmull.u8 q2, d4, d20
+ vmull.u8 q15, d7, d23
+ vmull.u8 q3, d6, d22
+
+ vrshrn.u16 d8, q0, #8
+ vrshrn.u16 d9, q12, #8
+ vrshrn.u16 d10, q1, #8
+ vrshrn.u16 d11, q13, #8
+ vrshrn.u16 d12, q2, #8
+ vrshrn.u16 d13, q14, #8
+ vrshrn.u16 d14, q3, #8
+ vrshrn.u16 d15, q15, #8
+
+ vaddw.u8 q0, d8
+ vaddw.u8 q12, d9
+ vaddw.u8 q1, d10
+ vaddw.u8 q13, d11
+ vaddw.u8 q2, d12
+ vaddw.u8 q14, d13
+ vaddw.u8 q3, d14
+ vaddw.u8 q15, d15
+
+ vrshrn.u16 d0, q0, #8
+ vrshrn.u16 d1, q12, #8
+ vrshrn.u16 d2, q1, #8
+ vrshrn.u16 d3, q13, #8
+ vrshrn.u16 d4, q2, #8
+ vrshrn.u16 d5, q14, #8
+ vrshrn.u16 d6, q3, #8
+ vrshrn.u16 d7, q15, #8
+.endm
+
+#define params_ADD zipped=0
+.macro blend_kernel_ADD
+ vqadd.u8 q0, q0, q8
+ vqadd.u8 q1, q1, q9
+ vqadd.u8 q2, q2, q10
+ vqadd.u8 q3, q3, q11
+.endm
+
+#define params_SUBTRACT zipped=0
+.macro blend_kernel_SUBTRACT
+ vqsub.u8 q0, q0, q8
+ vqsub.u8 q1, q1, q9
+ vqsub.u8 q2, q2, q10
+ vqsub.u8 q3, q3, q11
+.endm
+
+#define params_DIFFERENCE zipped=0
+.macro blend_kernel_DIFFERENCE
+ vabd.u8 q0, q0, q8
+ vabd.u8 q1, q1, q9
+ vabd.u8 q2, q2, q10
+ vabd.u8 q3, q3, q11
+.endm
+
+#define params_XOR zipped=0
+.macro blend_kernel_XOR
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q2, q2, q10
+ veor q3, q3, q11
+.endm
+
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Various sections of assembly code are dropped or substituted for
+ * simpler operations if they're not needed.
+ */
+.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
+.if \nowrap
+ \kernel
+.else
+ vpush {d8-d15}
+ subs r2, #64
+ b 2f
+ .align 4
+1:
+ .if \lddst
+ .if \zipped
+ vld4.8 {d0,d2,d4,d6}, [r0]!
+ vld4.8 {d1,d3,d5,d7}, [r0]!
+ .else
+ vld1.8 {d0-d3}, [r0]!
+ vld1.8 {d4-d7}, [r0]!
+ .endif
+ sub r0, #64
+ .endif
+ .if \ldsrc
+ .if \zipped
+ vld4.8 {d16,d18,d20,d22}, [r1]!
+ vld4.8 {d17,d19,d21,d23}, [r1]!
+ .else
+ vld1.8 {d16-d19}, [r1]!
+ vld1.8 {d20-d23}, [r1]!
+ .endif
+ .endif
+ .if \pld
+ .if \lddst ; pld [r0, #192] ; .endif
+ .if \ldsrc ; pld [r1, #192] ; .endif
+ .endif
+
+ \kernel
+
+ subs r2, #64
+ .if \zipped
+ vst4.8 {d0,d2,d4,d6}, [r0]!
+ vst4.8 {d1,d3,d5,d7}, [r0]!
+ .else
+ vst1.8 {d0-d3}, [r0]!
+ vst1.8 {d4-d7}, [r0]!
+ .endif
+
+2: bge 1b
+ adds r2, #64
+ beq 2f
+
+ /* To handle the tail portion of the data (something less than 64
+ * bytes) load small power-of-two chunks into working registers. It
+ * doesn't matter where they end up in the register; the same process
+ * will store them back out using the same positions and the operations
+ * don't require data to interact with its neighbours.
+ */
+ vmov.i8 q0, #0
+ vmov.i8 q1, #0
+ vmov.i8 q2, #0
+ vmov.i8 q3, #0
+
+ vmov.i8 q8, #0
+ vmov.i8 q9, #0
+ vmov.i8 q10, #0
+ vmov.i8 q11, #0
+
+ tst r2, #32
+ beq 1f
+ .if \lddst ; vld1.64 {d4-d7}, [r0]! ; .endif
+ .if \ldsrc ; vld1.64 {d20-d23}, [r1]! ; .endif
+1: tst r2, #16
+ beq 1f
+ .if \lddst ; vld1.64 {d2-d3}, [r0]! ; .endif
+ .if \ldsrc ; vld1.64 {d18-d19}, [r1]! ; .endif
+1: tst r2, #8
+ beq 1f
+ .if \lddst ; vld1.64 {d1}, [r0]! ; .endif
+ .if \ldsrc ; vld1.64 {d17}, [r1]! ; .endif
+1: tst r2, #4
+ beq 1f
+ .if \lddst ; vld1.32 {d0[1]}, [r0]! ; .endif
+ .if \ldsrc ; vld1.32 {d16[1]}, [r1]! ; .endif
+1: tst r2, #2
+ beq 1f
+ .if \lddst ; vld1.16 {d0[1]}, [r0]! ; .endif
+ .if \ldsrc ; vld1.16 {d16[1]}, [r1]! ; .endif
+1: tst r2, #1
+ beq 1f
+ .if \lddst ; vld1.8 {d0[1]}, [r0]! ; .endif
+ .if \ldsrc ; vld1.8 {d16[1]}, [r1]! ; .endif
+1:
+ .if \lddst ; sub r0, r2 ; .endif
+
+ .if \zipped
+ /* One small impediment in the process above is that some of the load
+ * operations can't perform byte-wise structure deinterleaving at the
+ * same time as loading only part of a register. So the data is loaded
+ * linearly and unpacked manually at this point.
+ */
+ vuzp.8 q0, q1
+ vuzp.8 q2, q3
+ vuzp.8 q0, q2
+ vuzp.8 q1, q3
+
+ vuzp.8 q8, q9
+ vuzp.8 q10, q11
+ vuzp.8 q8, q10
+ vuzp.8 q9, q11
+
+ \kernel
+
+ vzip.8 q0, q2
+ vzip.8 q1, q3
+ vzip.8 q0, q1
+ vzip.8 q2, q3
+ .else
+ \kernel
+ .endif
+
+ tst r2, #32
+ beq 1f
+ vst1.64 {d4-d7}, [r0]!
+1: tst r2, #16
+ beq 1f
+ vst1.64 {d2-d3}, [r0]!
+1: tst r2, #8
+ beq 1f
+ vst1.64 {d1}, [r0]!
+1: tst r2, #4
+ beq 1f
+ vst1.32 {d0[1]}, [r0]!
+1: tst r2, #2
+ beq 1f
+ vst1.16 {d0[1]}, [r0]!
+1: tst r2, #1
+ beq 2f
+ vst1.8 {d0[1]}, [r0]!
+2: vpop {d8-d15}
+.endif
+ mov r0, #0
+ bx lr
+.endm
+
+
+/* produce list of blend_line_XX() functions; each function uses the wrap_line
+ * macro, passing it the name of the operation macro it wants along with
+ * optional parameters to remove unnecessary operations.
+ */
+#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+
+/* int rsdIntrinsicBlend_K(
+ * uchar4 *out, // r0
+ * uchar4 const *in, // r1
+ * int slot, // r2
+ * size_t xstart, // r3
+ * size_t xend); // [sp]
+ */
+ENTRY(rsdIntrinsicBlend_K)
+ adr ip, blend_functions
+ cmp r2, #(blend_functions_end - blend_functions) >> 2
+ ldrlo ip, [ip, r2, LSL #2]
+ movhs ip, #0
+ ldr r2, [sp]
+ add r0, r3, LSL #2
+ add r1, r3, LSL #2
+ sub r2, r3
+ mov r2, r2, LSL #2
+ cmp ip, #0
+ addne ip, ip, pc
+ bxne ip
+1: mov r0, #-1
+ bx lr
+
+blend_functions:
+.set off,0
+#define BLEND_X(d, n) .rept d-off ; .word 0 ; .endr ; .word blend_line_##n-1b ; .set off, d+1 ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X
+blend_functions_end:
+
+END(rsdIntrinsicBlend_K)
diff --git a/toolkit/Blur.cpp b/toolkit/Blur.cpp
new file mode 100644
index 0000000..a95ff43
--- /dev/null
+++ b/toolkit/Blur.cpp
@@ -0,0 +1,545 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <math.h>
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Blur"
+
+/**
+ * Blurs an image or a section of an image.
+ *
+ * Our algorithm does two passes: a vertical blur followed by a horizontal blur.
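+ * Because a 2D gaussian kernel is separable, the two 1D passes produce the same result as a
+ * full 2D convolution while doing far less work per pixel.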
+ */
+class BlurTask : public Task {
+ // The image we're blurring.
+ const uchar* mIn;
+ // Where we store the blurred image.
+ uchar* outArray;
+ // The kernel radius is limited to 25 in ScriptIntrinsicBlur.java,
+ // so the maximum kernel size is 51 (= 2 * 25 + 1).
+ // The SSSE3 path requires the size to be a multiple of 4,
+ // so at least 52 entries are necessary. Values outside of the kernel should be 0.
+ float mFp[104];
+ uint16_t mIp[104];
+
+ // Working area to store the result of the vertical blur, to be used by the horizontal pass.
+ // There's one area per thread. Since the needed working area may be too large to put on the
+ // stack, we are allocating it from the heap. To avoid paying the allocation cost for each
+ // tile, we cache the scratch area here.
+ std::vector<void*> mScratch; // Pointers to the scratch areas, one per thread.
+ std::vector<size_t> mScratchSize; // The size in bytes of the scratch areas, one per thread.
+
+ // The radius of the blur, in floating point and integer format.
+ float mRadius;
+ int mIradius;
+
+ void kernelU4(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
+ uint32_t threadIndex);
+ void kernelU1(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+ void ComputeGaussianWeights();
+
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ public:
+ BlurTask(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY, size_t vectorSize,
+ uint32_t threadCount, float radius, const Restriction* restriction)
+ : Task{sizeX, sizeY, vectorSize, false, restriction},
+ mIn{in},
+ outArray{out},
+ mScratch{threadCount},
+ mScratchSize{threadCount},
+ mRadius{std::min(25.0f, radius)} {
+ ComputeGaussianWeights();
+ }
+
+ ~BlurTask() {
+ for (size_t i = 0; i < mScratch.size(); i++) {
+ if (mScratch[i]) {
+ free(mScratch[i]);
+ }
+ }
+ }
+};
+
+void BlurTask::ComputeGaussianWeights() {
+ memset(mFp, 0, sizeof(mFp));
+ memset(mIp, 0, sizeof(mIp));
+
+ // Compute gaussian weights for the blur
+ // e is Euler's number
+ float e = 2.718281828459045f;
+ float pi = 3.1415926535897932f;
+ // g(x) = (1 / (sqrt(2 * pi) * sigma)) * e ^ (-x^2 / (2 * sigma^2))
+ // x is of the form [-radius .. 0 .. radius]
+ // and sigma varies with the radius.
+ // Based on some experimental radius values and sigmas,
+ // we approximately fit sigma = f(radius) as
+ // sigma = radius * 0.4 + 0.6
+ // The larger the radius gets, the more our gaussian blur
+ // will resemble a box blur since with large sigma
+ // the gaussian curve begins to lose its shape
+ float sigma = 0.4f * mRadius + 0.6f;
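+ // (For example, the maximum radius of 25 gives sigma = 0.4f * 25 + 0.6f = 10.6f.)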
+
+ // Now compute the coefficients. We store some redundant values and precompute
+ // what we can, to save some math during the blur calculations.
+ float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
+ float coeff2 = - 1.0f / (2.0f * sigma * sigma);
+
+ float normalizeFactor = 0.0f;
+ float floatR = 0.0f;
+ int r;
+ mIradius = (float)ceil(mRadius) + 0.5f;
+ for (r = -mIradius; r <= mIradius; r ++) {
+ floatR = (float)r;
+ mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
+ normalizeFactor += mFp[r + mIradius];
+ }
+
+ // Now we need to normalize the weights because all our coefficients need to add up to one
+ normalizeFactor = 1.0f / normalizeFactor;
+ for (r = -mIradius; r <= mIradius; r ++) {
+ mFp[r + mIradius] *= normalizeFactor;
+ mIp[r + mIradius] = (uint16_t)(mFp[r + mIradius] * 65536.0f + 0.5f);
+ }
+}
+
+/**
+ * Vertical blur of a uchar4 line.
+ *
+ * @param sizeY Number of cells of the input array in the vertical direction.
+ * @param out Where to place the computed value.
+ * @param x Coordinate of the point we're blurring.
+ * @param y Coordinate of the point we're blurring.
+ * @param ptrIn Start of the input array.
+ * @param iStride The size in bytes of a row of the input array.
+ * @param gPtr The gaussian coefficients.
+ * @param iradius The radius of the blur.
+ */
+static void OneVU4(uint32_t sizeY, float4* out, int32_t x, int32_t y, const uchar* ptrIn,
+ int iStride, const float* gPtr, int iradius) {
+ const uchar *pi = ptrIn + x*4;
+
+ float4 blurredPixel = 0;
+ for (int r = -iradius; r <= iradius; r ++) {
+ int validY = std::max((y + r), 0);
+ validY = std::min(validY, (int)(sizeY - 1));
+ const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
+ float4 pf = convert<float4>(pvy[0]);
+ blurredPixel += pf * gPtr[0];
+ gPtr++;
+ }
+
+ out[0] = blurredPixel;
+}
+
+/**
+ * Vertical blur of a uchar1 line.
+ *
+ * @param sizeY Number of cells of the input array in the vertical direction.
+ * @param out Where to place the computed value.
+ * @param x Coordinate of the point we're blurring.
+ * @param y Coordinate of the point we're blurring.
+ * @param ptrIn Start of the input array.
+ * @param iStride The size in bytes of a row of the input array.
+ * @param gPtr The gaussian coefficients.
+ * @param iradius The radius of the blur.
+ */
+static void OneVU1(uint32_t sizeY, float *out, int32_t x, int32_t y,
+ const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
+
+ const uchar *pi = ptrIn + x;
+
+ float blurredPixel = 0;
+ for (int r = -iradius; r <= iradius; r ++) {
+ int validY = std::max((y + r), 0);
+ validY = std::min(validY, (int)(sizeY - 1));
+ float pf = (float)pi[validY * iStride];
+ blurredPixel += pf * gPtr[0];
+ gPtr++;
+ }
+
+ out[0] = blurredPixel;
+}
+
+
+extern "C" void rsdIntrinsicBlurU1_K(uchar *out, uchar const *in, size_t w, size_t h,
+ size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
+extern "C" void rsdIntrinsicBlurU4_K(uchar4 *out, uchar4 const *in, size_t w, size_t h,
+ size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
+
+#if defined(ARCH_X86_HAVE_SSSE3)
+extern void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr,
+ int rct, int x1, int ct);
+extern void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
+ int ct);
+extern void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
+ int ct);
+#endif
+
+/**
+ * Vertical blur of a line of RGBA, knowing that there are enough rows above and below us to avoid
+ * dealing with boundary conditions.
+ *
+ * @param out Where to store the results. This is the input to the horizontal blur.
+ * @param ptrIn The input data for this line.
+ * @param iStride The stride in bytes between rows of the input.
+ * @param gPtr The gaussian coefficients.
+ * @param ct The diameter of the blur.
+ * @param x2 How many cells to blur.
+ * @param usesSimd Whether this processor supports SIMD.
+ */
+static void OneVFU4(float4 *out, const uchar *ptrIn, int iStride, const float* gPtr, int ct,
+ int x2, bool usesSimd) {
+ int x1 = 0;
+#if defined(ARCH_X86_HAVE_SSSE3)
+ if (usesSimd) {
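+        // The SIMD routine appears to process cells in pairs, so round the
+        // count down to an even number; the scalar loop below picks up any
+        // remaining cell.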
+ int t = (x2 - x1);
+ t &= ~1;
+ if (t) {
+ rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
+ }
+ x1 += t;
+ out += t;
+ ptrIn += t << 2;
+ }
+#else
+ (void) usesSimd; // Avoid unused parameter warning.
+#endif
+ while(x2 > x1) {
+ const uchar *pi = ptrIn;
+ float4 blurredPixel = 0;
+ const float* gp = gPtr;
+
+ for (int r = 0; r < ct; r++) {
+ float4 pf = convert<float4>(((const uchar4 *)pi)[0]);
+ blurredPixel += pf * gp[0];
+ pi += iStride;
+ gp++;
+ }
+ out->xyzw = blurredPixel;
+ x1++;
+ out++;
+ ptrIn+=4;
+ }
+}
+
+/**
+ * Vertical blur of a line of U_8, knowing that there are enough rows above and below us to avoid
+ * dealing with boundary conditions.
+ *
+ * @param out Where to store the results. This is the input to the horizontal blur.
+ * @param ptrIn The input data for this line.
+ * @param iStride The stride in bytes between rows of the input.
+ * @param gPtr The gaussian coefficients.
+ * @param ct The diameter of the blur.
+ * @param len How many cells to blur.
+ * @param usesSimd Whether this processor supports SIMD.
+ */
+static void OneVFU1(float* out, const uchar* ptrIn, int iStride, const float* gPtr, int ct, int len,
+ bool usesSimd) {
+ int x1 = 0;
+
+ while((len > x1) && (((uintptr_t)ptrIn) & 0x3)) {
+ const uchar *pi = ptrIn;
+ float blurredPixel = 0;
+ const float* gp = gPtr;
+
+ for (int r = 0; r < ct; r++) {
+ float pf = (float)pi[0];
+ blurredPixel += pf * gp[0];
+ pi += iStride;
+ gp++;
+ }
+ out[0] = blurredPixel;
+ x1++;
+ out++;
+ ptrIn++;
+ len--;
+ }
+#if defined(ARCH_X86_HAVE_SSSE3)
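+    // Reuse the float4 SIMD vertical routine on the single-channel data by
+    // treating each group of four cells as one float4; the scalar prologue
+    // above already brought ptrIn to 4-byte alignment.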
+ if (usesSimd && (len > x1)) {
+ int t = (len - x1) >> 2;
+ t &= ~1;
+ if (t) {
+ rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
+ len -= t << 2;
+ ptrIn += t << 2;
+ out += t << 2;
+ }
+ }
+#else
+ (void) usesSimd; // Avoid unused parameter warning.
+#endif
+ while(len > 0) {
+ const uchar *pi = ptrIn;
+ float blurredPixel = 0;
+ const float* gp = gPtr;
+
+ for (int r = 0; r < ct; r++) {
+ float pf = (float)pi[0];
+ blurredPixel += pf * gp[0];
+ pi += iStride;
+ gp++;
+ }
+ out[0] = blurredPixel;
+ len--;
+ out++;
+ ptrIn++;
+ }
+}
+
+/**
+ * Horizontal blur of a uchar4 line.
+ *
+ * @param sizeX Number of cells of the input array in the horizontal direction.
+ * @param out Where to place the computed value.
+ * @param x Coordinate of the point we're blurring.
+ * @param ptrIn The start of the input row from which we're indexing x.
+ * @param gPtr The gaussian coefficients.
+ * @param iradius The radius of the blur.
+ */
+static void OneHU4(uint32_t sizeX, uchar4* out, int32_t x, const float4* ptrIn, const float* gPtr,
+ int iradius) {
+ float4 blurredPixel = 0;
+ for (int r = -iradius; r <= iradius; r ++) {
+ int validX = std::max((x + r), 0);
+ validX = std::min(validX, (int)(sizeX - 1));
+ float4 pf = ptrIn[validX];
+ blurredPixel += pf * gPtr[0];
+ gPtr++;
+ }
+
+ out->xyzw = convert<uchar4>(blurredPixel);
+}
+
+/**
+ * Horizontal blur of a uchar line.
+ *
+ * @param sizeX Number of cells of the input array in the horizontal direction.
+ * @param out Where to place the computed value.
+ * @param x Coordinate of the point we're blurring.
+ * @param ptrIn The start of the input row from which we're indexing x.
+ * @param gPtr The gaussian coefficients.
+ * @param iradius The radius of the blur.
+ */
+static void OneHU1(uint32_t sizeX, uchar* out, int32_t x, const float* ptrIn, const float* gPtr,
+ int iradius) {
+ float blurredPixel = 0;
+ for (int r = -iradius; r <= iradius; r ++) {
+ int validX = std::max((x + r), 0);
+ validX = std::min(validX, (int)(sizeX - 1));
+ float pf = ptrIn[validX];
+ blurredPixel += pf * gPtr[0];
+ gPtr++;
+ }
+
+ out[0] = (uchar)blurredPixel;
+}
+
+/**
+ * Full blur of a line of RGBA data.
+ *
+ * @param outPtr Where to store the results.
+ * @param xstart The index of the section we're starting to blur.
+ * @param xend The end index of the section.
+ * @param currentY The index of the line we're blurring.
+ * @param threadIndex The index of the thread we're running on; selects the scratch buffer.
+ */
+void BlurTask::kernelU4(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
+ uint32_t threadIndex) {
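+    // Rows of up to 2048 pixels are blurred through this stack buffer; wider
+    // rows switch to the per-thread heap scratch allocated below.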
+ float4 stackbuf[2048];
+ float4 *buf = &stackbuf[0];
+ const uint32_t stride = mSizeX * mVectorSize;
+
+ uchar4 *out = (uchar4 *)outPtr;
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+ if (mUsesSimd && mSizeX >= 4) {
+ rsdIntrinsicBlurU4_K(out, (uchar4 const *)(mIn + stride * currentY),
+ mSizeX, mSizeY,
+ stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
+ return;
+ }
+#endif
+
+ if (mSizeX > 2048) {
+ if ((mSizeX > mScratchSize[threadIndex]) || !mScratch[threadIndex]) {
+            // Pad the allocation by one extra float4 so that the 16-byte
+            // alignment fix-up below still leaves room for mSizeX elements.
+ mScratch[threadIndex] = realloc(mScratch[threadIndex], (mSizeX + 1) * 16);
+ mScratchSize[threadIndex] = mSizeX;
+ }
+ // realloc only aligns to 8 bytes so we manually align to 16.
+ buf = (float4 *) ((((intptr_t)mScratch[threadIndex]) + 15) & ~0xf);
+ }
+ float4 *fout = (float4 *)buf;
+ int y = currentY;
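+    // If the whole vertical window fits inside the image, use the fast path
+    // that skips edge clamping; otherwise fall back to the per-column clamped
+    // version.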
+ if ((y > mIradius) && (y < ((int)mSizeY - mIradius))) {
+ const uchar *pi = mIn + (y - mIradius) * stride;
+ OneVFU4(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
+ } else {
+ x1 = 0;
+ while(mSizeX > x1) {
+ OneVU4(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
+ fout++;
+ x1++;
+ }
+ }
+
+ x1 = xstart;
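+    // Horizontal pass: handle the left edge (where the window would index
+    // before the start of buf) with the scalar routine, the middle with SIMD
+    // where available, and the right edge with the scalar routine again.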
+ while ((x1 < (uint32_t)mIradius) && (x1 < x2)) {
+ OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
+ out++;
+ x1++;
+ }
+#if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + mIradius) < x2) {
+ rsdIntrinsicBlurHFU4_K(out, buf - mIradius, mFp,
+ mIradius * 2 + 1, x1, x2 - mIradius);
+ out += (x2 - mIradius) - x1;
+ x1 = x2 - mIradius;
+ }
+ }
+#endif
+ while(x2 > x1) {
+ OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
+ out++;
+ x1++;
+ }
+}
+
+/**
+ * Full blur of a line of U_8 data.
+ *
+ * @param outPtr Where to store the results.
+ * @param xstart The index of the section we're starting to blur.
+ * @param xend The end index of the section.
+ * @param currentY The index of the line we're blurring.
+ */
+void BlurTask::kernelU1(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
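+    // Scratch for one vertically blurred row. Unlike kernelU4 there is no
+    // heap fallback here, so this fixed 4 * 2048 float buffer presumably
+    // bounds the row widths this path is expected to handle.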
+ float buf[4 * 2048];
+ const uint32_t stride = mSizeX * mVectorSize;
+
+ uchar *out = (uchar *)outPtr;
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+ if (mUsesSimd && mSizeX >= 16) {
+ // The specialisation for r<=8 has an awkward prefill case, which is
+ // fiddly to resolve, where starting close to the right edge can cause
+ // a read beyond the end of input. So avoid that case here.
+ if (mIradius > 8 || (mSizeX - std::max(0, (int32_t)x1 - 8)) >= 16) {
+ rsdIntrinsicBlurU1_K(out, mIn + stride * currentY, mSizeX, mSizeY,
+ stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
+ return;
+ }
+ }
+#endif
+
+ float *fout = (float *)buf;
+ int y = currentY;
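+    // Same fast-path / clamped-path split as in kernelU4 above.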
+ if ((y > mIradius) && (y < ((int)mSizeY - mIradius -1))) {
+ const uchar *pi = mIn + (y - mIradius) * stride;
+ OneVFU1(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
+ } else {
+ x1 = 0;
+ while(mSizeX > x1) {
+ OneVU1(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
+ fout++;
+ x1++;
+ }
+ }
+
+ x1 = xstart;
+ while ((x1 < x2) &&
+ ((x1 < (uint32_t)mIradius) || (((uintptr_t)out) & 0x3))) {
+ OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
+ out++;
+ x1++;
+ }
+#if defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
+ if ((x1 + mIradius) < x2) {
+ uint32_t len = x2 - (x1 + mIradius);
+ len &= ~3;
+
+            // rsdIntrinsicBlurHFU1_K() processes four float values in |buf| at a
+            // time, so it needs to ensure four more values can be accessed, to
+            // avoid reading from an uninitialized part of the buffer.
+ if (len > 4) {
+ len -= 4;
+ rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - mIradius, mFp,
+ mIradius * 2 + 1, x1, x1 + len);
+ out += len;
+ x1 += len;
+ }
+ }
+ }
+#endif
+ while(x2 > x1) {
+ OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
+ out++;
+ x1++;
+ }
+}
+
+void BlurTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ for (size_t y = startY; y < endY; y++) {
+ void* outPtr = outArray + (mSizeX * y + startX) * mVectorSize;
+ if (mVectorSize == 4) {
+ kernelU4(outPtr, startX, endX, y, threadIndex);
+ } else {
+ kernelU1(outPtr, startX, endX, y);
+ }
+ }
+}
+
+void RenderScriptToolkit::blur(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY,
+ size_t vectorSize, int radius, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+ return;
+ }
+ if (radius <= 0 || radius > 25) {
+ ALOGE("The radius should be between 1 and 25. %d provided.", radius);
+ }
+ if (vectorSize != 1 && vectorSize != 4) {
+ ALOGE("The vectorSize should be 1 or 4. %zu provided.", vectorSize);
+ }
+#endif
+
+ BlurTask task(in, out, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), radius,
+ restriction);
+ processor->doTask(&task);
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/Blur_advsimd.S b/toolkit/Blur_advsimd.S
new file mode 100644
index 0000000..6d3cb8d
--- /dev/null
+++ b/toolkit/Blur_advsimd.S
@@ -0,0 +1,1868 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define PRIVATE(f) .text; .align 4; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+//#define ARCH_ARM64_USE_BLUR_PRELOAD
+
+/* Number of fractional bits to preserve in intermediate results. The
+ * intermediate storage is 16-bit, and we started with 8 bit data (the integer
+ * part), so this should be between 0 and 8.
+ */
+.set FRACTION_BITS, 7
+.set MAX_R, 25
+
+
+/* A quick way of making a line of code conditional on some other condition.
+ * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
+ * `ifcc`:
+ */
+.macro ifcc zzz:vararg
+.if cc
+ \zzz
+.endif
+.endm
+
+/* It's not always clear that prefetching is beneficial and this needs further
+ * testing on different cores, so it's made switchable here.
+ */
+#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
+#define VERTPLD(...) prfm PLDL1KEEP, [__VA_ARGS__]
+#else
+#define VERTPLD(...) nop
+#endif
+
+/* Fetch 16 columns of bytes (regardless of image format), convolve these
+ * vertically, and leave them in the register file. If working near the top or
+ * bottom of an image then clamp the addressing while loading the data in.
+ *
+ * The convolution is fully unrolled for windows up to max_r, with the
+ * outermost edges calculated first. This way it's possible to branch directly
+ * into the relevant part of the code for an arbitrary convolution radius. Two
+ * variants of the loop are produced; one eliminates the clamping code for a
+ * slight speed advantage.
+ *
+ * Where the macro is called with reg=x, the specified register is taken to
+ * contain a pre-calculated pointer into one of the two loops.
+ *
+ * Input:
+ * x1 -- src
+ * x2 -- pitch
+ * x5 -- r
+ * x6 -- rup (r, unless clipped to top of source image)
+ * x7 -- rdn (r, unless clipped to bottom of source image)
+ * x12 -- switch index
+ * v0-v3 -- coefficient table
+ * x13 = -pitch
+ * x15 = top-row in
+ * x19 = bottom-row in
+ * Output:
+ * x1 += 16
+ * v10,v11 -- 16 convolved columns
+ * Modifies:
+ * x10 = upper row pointer
+ * x11 = lower row pointer
+ * v12-v15 = temporary sums
+ */
+.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
+ .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
+
+ ld1 {v15.16b}, [x1], #16
+ mov x10, x15
+
+ uxtl v14.8h, v15.8b
+ VERTPLD(x1, #16)
+ uxtl2 v15.8h, v15.16b
+ .if \max_r < 16 // approximate
+ ifcc adr \reg, 1f
+ .else
+ ifcc adrp \reg, 1f
+ ifcc add \reg, \reg, #:lo12:1f
+ .endif
+
+ umull v12.4s, v14.4h, v0.h[0]
+ ifcc sub \reg, \reg, x5, LSL #6
+ umull2 v13.4s, v14.8h, v0.h[0]
+ mov x11, x19
+ umull v14.4s, v15.4h, v0.h[0]
+ ifcc add \reg, \reg, x5, LSL #3
+ umull2 v15.4s, v15.8h, v0.h[0]
+ br \reg
+
+ /* This version of the vertical fetch loop body is used away from the edges
+ * of the source image. The pointers start at the top and bottom source rows
+ * and work their way towards the centre on each iteration. This way the
+ * number of taps used can be controlled by jumping directly into the middle
+ * of the loop and running to completion.
+ * If the loop body changes size then the code which calculates the address of
+     * the initial iteration must be updated accordingly.
+ */
+ .macro vertfetch_noclamp i, dreg
+ .if 0 < \i && \i <= \max_r
+ ld1 {v10.16b}, [x10], x2
+ ld1 {v11.16b}, [x11], x13
+ uaddl v16.8h, v10.8b, v11.8b
+ uaddl2 v11.8h, v10.16b, v11.16b
+ umlal v12.4s, v16.4h, \dreg
+ umlal2 v13.4s, v16.8h, \dreg
+ VERTPLD(x10, #32)
+ umlal v14.4s, v11.4h, \dreg
+ VERTPLD(x11, #32)
+ umlal2 v15.4s, v11.8h, \dreg
+ .endif
+ .endm
+
+ /* This version of the vertical fetch loop body is used near the edges of the
+ * source image, where one or both of the accesses may start with a clamped
+ * value, and the row addresses only begin to change after some number of
+ * iterations before the end.
+ * If the loop body changes size then the code which calculates the address of
+     * the initial iteration must be updated accordingly.
+ */
+ .macro vertfetch_clamped i, dreg
+ .if 0 < \i && \i <= \max_r
+ ld1 {v10.16b}, [x10], x2
+ cmp x6, #\i
+ ld1 {v11.16b}, [x11], x13
+ csel x10, x15, x10, lo
+ uaddl v16.8h, v10.8b, v11.8b
+ cmp x7, #\i
+ uaddl2 v11.8h, v10.16b, v11.16b
+ csel x11, x19, x11, lo
+ umlal v12.4s, v16.4h, \dreg
+ umlal2 v13.4s, v16.8h, \dreg
+ VERTPLD(x10, #32)
+ umlal v14.4s, v11.4h, \dreg
+ VERTPLD(x11, #32)
+ umlal2 v15.4s, v11.8h, \dreg
+ .endif
+ .endm
+
+ /* Entry into this unrolled loop is computed as a negative index from
+ * \labelc at the end of the block.
+ */
+ .align 4
+ vertfetch_clamped 27, v3.h[3]
+ vertfetch_clamped 26, v3.h[2]
+ vertfetch_clamped 25, v3.h[1]
+ vertfetch_clamped 24, v3.h[0]
+ vertfetch_clamped 23, v2.h[7]
+ vertfetch_clamped 22, v2.h[6]
+ vertfetch_clamped 21, v2.h[5]
+ vertfetch_clamped 20, v2.h[4]
+ vertfetch_clamped 19, v2.h[3]
+ vertfetch_clamped 18, v2.h[2]
+ vertfetch_clamped 17, v2.h[1]
+ vertfetch_clamped 16, v2.h[0]
+ vertfetch_clamped 15, v1.h[7]
+ vertfetch_clamped 14, v1.h[6]
+ vertfetch_clamped 13, v1.h[5]
+ vertfetch_clamped 12, v1.h[4]
+ vertfetch_clamped 11, v1.h[3]
+ vertfetch_clamped 10, v1.h[2]
+ vertfetch_clamped 9, v1.h[1]
+ vertfetch_clamped 8, v1.h[0]
+ vertfetch_clamped 7, v0.h[7]
+ vertfetch_clamped 6, v0.h[6]
+ vertfetch_clamped 5, v0.h[5]
+ vertfetch_clamped 4, v0.h[4]
+ vertfetch_clamped 3, v0.h[3]
+ vertfetch_clamped 2, v0.h[2]
+ vertfetch_clamped 1, v0.h[1]
+ vertfetch_clamped 0, v0.h[0]
+ 1:
+ \labelc : b 2f /* done with clamped loop, skip over non-clamped loop */
+
+ /* Entry into this unrolled loop is computed as a negative index from
+ * \labelnc at the end of the block.
+ */
+ .align 4
+ vertfetch_noclamp 27, v3.h[3]
+ vertfetch_noclamp 26, v3.h[2]
+ vertfetch_noclamp 25, v3.h[1]
+ vertfetch_noclamp 24, v3.h[0]
+ vertfetch_noclamp 23, v2.h[7]
+ vertfetch_noclamp 22, v2.h[6]
+ vertfetch_noclamp 21, v2.h[5]
+ vertfetch_noclamp 20, v2.h[4]
+ vertfetch_noclamp 19, v2.h[3]
+ vertfetch_noclamp 18, v2.h[2]
+ vertfetch_noclamp 17, v2.h[1]
+ vertfetch_noclamp 16, v2.h[0]
+ vertfetch_noclamp 15, v1.h[7]
+ vertfetch_noclamp 14, v1.h[6]
+ vertfetch_noclamp 13, v1.h[5]
+ vertfetch_noclamp 12, v1.h[4]
+ vertfetch_noclamp 11, v1.h[3]
+ vertfetch_noclamp 10, v1.h[2]
+ vertfetch_noclamp 9, v1.h[1]
+ vertfetch_noclamp 8, v1.h[0]
+ vertfetch_noclamp 7, v0.h[7]
+ vertfetch_noclamp 6, v0.h[6]
+ vertfetch_noclamp 5, v0.h[5]
+ vertfetch_noclamp 4, v0.h[4]
+ vertfetch_noclamp 3, v0.h[3]
+ vertfetch_noclamp 2, v0.h[2]
+ vertfetch_noclamp 1, v0.h[1]
+ vertfetch_noclamp 0, v0.h[0]
+ \labelnc :
+
+ .purgem vertfetch_clamped
+ .purgem vertfetch_noclamp
+
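+  /* Narrow the 32-bit accumulators back to 16 bits. The coefficients in
+   * v0-v3 are 0.16 fixed point, so shifting right by 16 - FRACTION_BITS
+   * leaves FRACTION_BITS fractional bits in the intermediate values, as
+   * described at the top of the file.
+   */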
+ 2: uqrshrn v10.4h, v12.4s, #16 - FRACTION_BITS
+ add x15, x15, #16
+ uqrshrn2 v10.8h, v13.4s, #16 - FRACTION_BITS
+ add x19, x19, #16
+ uqrshrn v11.4h, v14.4s, #16 - FRACTION_BITS
+ uqrshrn2 v11.8h, v15.4s, #16 - FRACTION_BITS
+.endm /*}}}*/
+
+/* Some portion of the convolution window (as much as will fit, and all of it
+ * for the uchar1 cases) is kept in the register file to avoid unnecessary
+ * memory accesses. This forces the horizontal loops to be unrolled because
+ * there's no indexed addressing into the register file.
+ *
+ * As in the fetch macro, the operations are ordered from outside to inside, so
+ * that jumping into the middle of the block bypasses the unwanted window taps.
+ *
+ * There are several variants of the macro because of the fixed offsets of the
+ * taps -- the wider the maximum radius the further the centre tap is from the
+ * most recently fetched data. This means that pre-filling the window requires
+ * more data that won't be used and it means that rotating the window involves
+ * more mov operations.
+ *
+ * When the buffer gets too big the buffer at [x9] is used.
+ *
+ * Input:
+ *      v16-v31,v4-v11 -- convolution window
+ * x9 -- pointer to additional convolution window data
+ * Output:
+ * x9 -- updated buffer pointer (if used)
+ * d31 -- result to be stored
+ * Modifies:
+ * x12 -- temp buffer pointer
+ * v12-v13 -- temporaries for load and vext operations.
+ * v14-v15 -- intermediate sums
+ */
+#define TUNED_LIST1 8, 16
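+/* Radii for which tuned single-channel horizontal convolvers are provided
+ * below (hconv1_8 and hconv1_16); hconv1_25 covers the remaining radii up to
+ * MAX_R.
+ */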
+.macro hconv1_8/*{{{*/
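+/* The table at 200: below is indexed by the radius in x5; each entry is the
+ * offset from the `100: br x12` branch point to the label (10<r>) where the
+ * unrolled taps for that radius begin, so smaller radii skip the outer taps.
+ */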
+
+.rodata
+ 200: .hword -4
+ .hword 101f-100f
+ .hword 102f-100f
+ .hword 103f-100f
+ .hword 104f-100f
+ .hword 105f-100f
+ .hword 106f-100f
+ .hword 107f-100f
+ .hword 108f-100f
+ .align 4
+.text
+ umull v14.4s, v9.4h, v0.h[0]
+ umull2 v15.4s, v9.8h, v0.h[0]
+
+ adrp x16, 200b
+ add x16, x16, :lo12:200b
+ ldrsh x12, [x16, x5, LSL #1]
+ adr x16, 100f
+ add x12, x12, x16
+ 100: br x12
+ 108: umlal v14.4s, v8.4h, v1.h[0]
+ umlal2 v15.4s, v8.8h, v1.h[0]
+ umlal v14.4s, v10.4h, v1.h[0]
+ umlal2 v15.4s, v10.8h, v1.h[0]
+ 107: ext v12.16b, v8.16b, v9.16b, #1*2
+ ext v13.16b, v9.16b, v10.16b, #7*2
+ umlal v14.4s, v12.4h, v0.h[7]
+ umlal2 v15.4s, v12.8h, v0.h[7]
+ umlal v14.4s, v13.4h, v0.h[7]
+ umlal2 v15.4s, v13.8h, v0.h[7]
+ 106: ext v12.16b, v8.16b, v9.16b, #2*2
+ ext v13.16b, v9.16b, v10.16b, #6*2
+ umlal v14.4s, v12.4h, v0.h[6]
+ umlal2 v15.4s, v12.8h, v0.h[6]
+ umlal v14.4s, v13.4h, v0.h[6]
+ umlal2 v15.4s, v13.8h, v0.h[6]
+ 105: ext v12.16b, v8.16b, v9.16b, #3*2
+ ext v13.16b, v9.16b, v10.16b, #5*2
+ umlal v14.4s, v12.4h, v0.h[5]
+ umlal2 v15.4s, v12.8h, v0.h[5]
+ umlal v14.4s, v13.4h, v0.h[5]
+ umlal2 v15.4s, v13.8h, v0.h[5]
+ 104: //ext v12.16b, v8.16b, v9.16b, #4*2
+ //ext v13.16b, v9.16b, v10.16b, #4*2
+ umlal2 v14.4s, v8.8h, v0.h[4]
+ umlal v15.4s, v9.4h, v0.h[4]
+ umlal2 v14.4s, v9.8h, v0.h[4]
+ umlal v15.4s, v10.4h, v0.h[4]
+ 103: ext v12.16b, v8.16b, v9.16b, #5*2
+ ext v13.16b, v9.16b, v10.16b, #3*2
+ umlal v14.4s, v12.4h, v0.h[3]
+ umlal2 v15.4s, v12.8h, v0.h[3]
+ umlal v14.4s, v13.4h, v0.h[3]
+ umlal2 v15.4s, v13.8h, v0.h[3]
+ 102: ext v12.16b, v8.16b, v9.16b, #6*2
+ ext v13.16b, v9.16b, v10.16b, #2*2
+ umlal v14.4s, v12.4h, v0.h[2]
+ umlal2 v15.4s, v12.8h, v0.h[2]
+ umlal v14.4s, v13.4h, v0.h[2]
+ umlal2 v15.4s, v13.8h, v0.h[2]
+ 101: ext v12.16b, v8.16b, v9.16b, #7*2
+ ext v13.16b, v9.16b, v10.16b, #1*2
+ umlal v14.4s, v12.4h, v0.h[1]
+ umlal2 v15.4s, v12.8h, v0.h[1]
+ umlal v14.4s, v13.4h, v0.h[1]
+ umlal2 v15.4s, v13.8h, v0.h[1]
+
+ uqrshrn v14.4h, v14.4s, #16
+ uqrshrn2 v14.8h, v15.4s, #16
+ uqrshrn v15.8b, v14.8h, #FRACTION_BITS
+
+ mov v8.16b, v9.16b
+ mov v9.16b, v10.16b
+ mov v10.16b, v11.16b
+.endm/*}}}*/
+
+.macro hconv1_16/*{{{*/
+.rodata
+ 200: .hword -4
+ .hword 101f-100f
+ .hword 102f-100f
+ .hword 103f-100f
+ .hword 104f-100f
+ .hword 105f-100f
+ .hword 106f-100f
+ .hword 107f-100f
+ .hword 108f-100f
+ .hword 109f-100f
+ .hword 110f-100f
+ .hword 111f-100f
+ .hword 112f-100f
+ .hword 113f-100f
+ .hword 114f-100f
+ .hword 115f-100f
+ .hword 116f-100f
+ .align 4
+
+.text
+ umull v14.4s, v8.4h, v0.h[0]
+ umull2 v15.4s, v8.8h, v0.h[0]
+
+ adrp x16, 200b
+ add x16, x16, :lo12:200b
+ ldrsh x12, [x16, x5, LSL #1]
+ adr x16, 100f
+ add x12, x12, x16
+ 100: br x12
+ 116: //ext v12.16b, v6.16b, v7.16b, #0*2
+ //ext v13.16b, v10.16b, v11.16b, #0*2
+ umlal v14.4s, v6.4h, v2.h[0]
+ umlal2 v15.4s, v6.8h, v2.h[0]
+ umlal v14.4s, v10.4h, v2.h[0]
+ umlal2 v15.4s, v10.8h, v2.h[0]
+ 115: ext v12.16b, v6.16b, v7.16b, #1*2
+ ext v13.16b, v9.16b, v10.16b, #7*2
+ umlal v14.4s, v12.4h, v1.h[7]
+ umlal2 v15.4s, v12.8h, v1.h[7]
+ umlal v14.4s, v13.4h, v1.h[7]
+ umlal2 v15.4s, v13.8h, v1.h[7]
+ 114: ext v12.16b, v6.16b, v7.16b, #2*2
+ ext v13.16b, v9.16b, v10.16b, #6*2
+ umlal v14.4s, v12.4h, v1.h[6]
+ umlal2 v15.4s, v12.8h, v1.h[6]
+ umlal v14.4s, v13.4h, v1.h[6]
+ umlal2 v15.4s, v13.8h, v1.h[6]
+ 113: ext v12.16b, v6.16b, v7.16b, #3*2
+ ext v13.16b, v9.16b, v10.16b, #5*2
+ umlal v14.4s, v12.4h, v1.h[5]
+ umlal2 v15.4s, v12.8h, v1.h[5]
+ umlal v14.4s, v13.4h, v1.h[5]
+ umlal2 v15.4s, v13.8h, v1.h[5]
+ 112: //ext v12.16b, v6.16b, v7.16b, #4*2
+ //ext v13.16b, v9.16b, v10.16b, #4*2
+ umlal2 v14.4s, v6.8h, v1.h[4]
+ umlal v15.4s, v7.4h, v1.h[4]
+ umlal2 v14.4s, v9.8h, v1.h[4]
+ umlal v15.4s, v10.4h, v1.h[4]
+ 111: ext v12.16b, v6.16b, v7.16b, #5*2
+ ext v13.16b, v9.16b, v10.16b, #3*2
+ umlal v14.4s, v12.4h, v1.h[3]
+ umlal2 v15.4s, v12.8h, v1.h[3]
+ umlal v14.4s, v13.4h, v1.h[3]
+ umlal2 v15.4s, v13.8h, v1.h[3]
+ 110: ext v12.16b, v6.16b, v7.16b, #6*2
+ ext v13.16b, v9.16b, v10.16b, #2*2
+ umlal v14.4s, v12.4h, v1.h[2]
+ umlal2 v15.4s, v12.8h, v1.h[2]
+ umlal v14.4s, v13.4h, v1.h[2]
+ umlal2 v15.4s, v13.8h, v1.h[2]
+ 109: ext v12.16b, v6.16b, v7.16b, #7*2
+ ext v13.16b, v9.16b, v10.16b, #1*2
+ umlal v14.4s, v12.4h, v1.h[1]
+ umlal2 v15.4s, v12.8h, v1.h[1]
+ umlal v14.4s, v13.4h, v1.h[1]
+ umlal2 v15.4s, v13.8h, v1.h[1]
+ 108: //ext v12.16b, v7.16b, v8.16b, #0*2
+ //ext v13.16b, v9.16b, v10.16b, #0*2
+ umlal v14.4s, v7.4h, v1.h[0]
+ umlal2 v15.4s, v7.8h, v1.h[0]
+ umlal v14.4s, v9.4h, v1.h[0]
+ umlal2 v15.4s, v9.8h, v1.h[0]
+ 107: ext v12.16b, v7.16b, v8.16b, #1*2
+ ext v13.16b, v8.16b, v9.16b, #7*2
+ umlal v14.4s, v12.4h, v0.h[7]
+ umlal2 v15.4s, v12.8h, v0.h[7]
+ umlal v14.4s, v13.4h, v0.h[7]
+ umlal2 v15.4s, v13.8h, v0.h[7]
+ 106: ext v12.16b, v7.16b, v8.16b, #2*2
+ ext v13.16b, v8.16b, v9.16b, #6*2
+ umlal v14.4s, v12.4h, v0.h[6]
+ umlal2 v15.4s, v12.8h, v0.h[6]
+ umlal v14.4s, v13.4h, v0.h[6]
+ umlal2 v15.4s, v13.8h, v0.h[6]
+ 105: ext v12.16b, v7.16b, v8.16b, #3*2
+ ext v13.16b, v8.16b, v9.16b, #5*2
+ umlal v14.4s, v12.4h, v0.h[5]
+ umlal2 v15.4s, v12.8h, v0.h[5]
+ umlal v14.4s, v13.4h, v0.h[5]
+ umlal2 v15.4s, v13.8h, v0.h[5]
+ 104: //ext v12.16b, v7.16b, v8.16b, #4*2
+ //ext v13.16b, v8.16b, v9.16b, #4*2
+ umlal2 v14.4s, v7.8h, v0.h[4]
+ umlal v15.4s, v8.4h, v0.h[4]
+ umlal2 v14.4s, v8.8h, v0.h[4]
+ umlal v15.4s, v9.4h, v0.h[4]
+ 103: ext v12.16b, v7.16b, v8.16b, #5*2
+ ext v13.16b, v8.16b, v9.16b, #3*2
+ umlal v14.4s, v12.4h, v0.h[3]
+ umlal2 v15.4s, v12.8h, v0.h[3]
+ umlal v14.4s, v13.4h, v0.h[3]
+ umlal2 v15.4s, v13.8h, v0.h[3]
+ 102: ext v12.16b, v7.16b, v8.16b, #6*2
+ ext v13.16b, v8.16b, v9.16b, #2*2
+ umlal v14.4s, v12.4h, v0.h[2]
+ umlal2 v15.4s, v12.8h, v0.h[2]
+ umlal v14.4s, v13.4h, v0.h[2]
+ umlal2 v15.4s, v13.8h, v0.h[2]
+ 101: ext v12.16b, v7.16b, v8.16b, #7*2
+ ext v13.16b, v8.16b, v9.16b, #1*2
+ umlal v14.4s, v12.4h, v0.h[1]
+ umlal2 v15.4s, v12.8h, v0.h[1]
+ umlal v14.4s, v13.4h, v0.h[1]
+ umlal2 v15.4s, v13.8h, v0.h[1]
+
+ uqrshrn v14.4h, v14.4s, #16
+ uqrshrn2 v14.8h, v15.4s, #16
+ uqrshrn v15.8b, v14.8h, #FRACTION_BITS
+
+ mov v6.16b, v7.16b
+ mov v7.16b, v8.16b
+ mov v8.16b, v9.16b
+ mov v9.16b, v10.16b
+ mov v10.16b, v11.16b
+.endm/*}}}*/
+
+.macro hconv1_25/*{{{*/
+.rodata
+ 200: .hword -4
+ .hword 101f-100f
+ .hword 102f-100f
+ .hword 103f-100f
+ .hword 104f-100f
+ .hword 105f-100f
+ .hword 106f-100f
+ .hword 107f-100f
+ .hword 108f-100f
+ .hword 109f-100f
+ .hword 110f-100f
+ .hword 111f-100f
+ .hword 112f-100f
+ .hword 113f-100f
+ .hword 114f-100f
+ .hword 115f-100f
+ .hword 116f-100f
+ .hword 117f-100f
+ .hword 118f-100f
+ .hword 119f-100f
+ .hword 120f-100f
+ .hword 121f-100f
+ .hword 122f-100f
+ .hword 123f-100f
+ .hword 124f-100f
+ .hword 125f-100f
+ .align 4
+.text
+ ext v12.16b, v6.16b, v7.16b, #7*2
+ umull v14.4s, v12.4h, v0.h[0]
+ umull2 v15.4s, v12.8h, v0.h[0]
+
+ adrp x16, 200b
+ add x16, x16, :lo12:200b
+ ldrsh x12, [x16, x5, LSL #1]
+ adr x16, 100f
+ add x12, x12, x16
+ 100: br x12
+ 125: ext v12.16b, v31.16b, v4.16b, #6*2
+ ext v13.16b, v10.16b, v11.16b, #0*2
+ umlal v14.4s, v12.4h, v3.h[1]
+ umlal2 v15.4s, v12.8h, v3.h[1]
+ umlal v14.4s, v13.4h, v3.h[1]
+ umlal2 v15.4s, v13.8h, v3.h[1]
+ 124: ext v12.16b, v31.16b, v4.16b, #7*2
+ ext v13.16b, v9.16b, v10.16b, #7*2
+ umlal v14.4s, v12.4h, v3.h[0]
+ umlal2 v15.4s, v12.8h, v3.h[0]
+ umlal v14.4s, v13.4h, v3.h[0]
+ umlal2 v15.4s, v13.8h, v3.h[0]
+ 123: ext v12.16b, v4.16b, v5.16b, #0*2
+ ext v13.16b, v9.16b, v10.16b, #6*2
+ umlal v14.4s, v12.4h, v2.h[7]
+ umlal2 v15.4s, v12.8h, v2.h[7]
+ umlal v14.4s, v13.4h, v2.h[7]
+ umlal2 v15.4s, v13.8h, v2.h[7]
+ 122: ext v12.16b, v4.16b, v5.16b, #1*2
+ ext v13.16b, v9.16b, v10.16b, #5*2
+ umlal v14.4s, v12.4h, v2.h[6]
+ umlal2 v15.4s, v12.8h, v2.h[6]
+ umlal v14.4s, v13.4h, v2.h[6]
+ umlal2 v15.4s, v13.8h, v2.h[6]
+ 121: ext v12.16b, v4.16b, v5.16b, #2*2
+ ext v13.16b, v9.16b, v10.16b, #4*2
+ umlal v14.4s, v12.4h, v2.h[5]
+ umlal2 v15.4s, v12.8h, v2.h[5]
+ umlal v14.4s, v13.4h, v2.h[5]
+ umlal2 v15.4s, v13.8h, v2.h[5]
+ 120: ext v12.16b, v4.16b, v5.16b, #3*2
+ ext v13.16b, v9.16b, v10.16b, #3*2
+ umlal v14.4s, v12.4h, v2.h[4]
+ umlal2 v15.4s, v12.8h, v2.h[4]
+ umlal v14.4s, v13.4h, v2.h[4]
+ umlal2 v15.4s, v13.8h, v2.h[4]
+ 119: ext v12.16b, v4.16b, v5.16b, #4*2
+ ext v13.16b, v9.16b, v10.16b, #2*2
+ umlal v14.4s, v12.4h, v2.h[3]
+ umlal2 v15.4s, v12.8h, v2.h[3]
+ umlal v14.4s, v13.4h, v2.h[3]
+ umlal2 v15.4s, v13.8h, v2.h[3]
+ 118: ext v12.16b, v4.16b, v5.16b, #5*2
+ ext v13.16b, v9.16b, v10.16b, #1*2
+ umlal v14.4s, v12.4h, v2.h[2]
+ umlal2 v15.4s, v12.8h, v2.h[2]
+ umlal v14.4s, v13.4h, v2.h[2]
+ umlal2 v15.4s, v13.8h, v2.h[2]
+ 117: ext v12.16b, v4.16b, v5.16b, #6*2
+ ext v13.16b, v9.16b, v10.16b, #0*2
+ umlal v14.4s, v12.4h, v2.h[1]
+ umlal2 v15.4s, v12.8h, v2.h[1]
+ umlal v14.4s, v13.4h, v2.h[1]
+ umlal2 v15.4s, v13.8h, v2.h[1]
+ 116: ext v12.16b, v4.16b, v5.16b, #7*2
+ ext v13.16b, v8.16b, v9.16b, #7*2
+ umlal v14.4s, v12.4h, v2.h[0]
+ umlal2 v15.4s, v12.8h, v2.h[0]
+ umlal v14.4s, v13.4h, v2.h[0]
+ umlal2 v15.4s, v13.8h, v2.h[0]
+ 115: ext v12.16b, v5.16b, v6.16b, #0*2
+ ext v13.16b, v8.16b, v9.16b, #6*2
+ umlal v14.4s, v12.4h, v1.h[7]
+ umlal2 v15.4s, v12.8h, v1.h[7]
+ umlal v14.4s, v13.4h, v1.h[7]
+ umlal2 v15.4s, v13.8h, v1.h[7]
+ 114: ext v12.16b, v5.16b, v6.16b, #1*2
+ ext v13.16b, v8.16b, v9.16b, #5*2
+ umlal v14.4s, v12.4h, v1.h[6]
+ umlal2 v15.4s, v12.8h, v1.h[6]
+ umlal v14.4s, v13.4h, v1.h[6]
+ umlal2 v15.4s, v13.8h, v1.h[6]
+ 113: ext v12.16b, v5.16b, v6.16b, #2*2
+ ext v13.16b, v8.16b, v9.16b, #4*2
+ umlal v14.4s, v12.4h, v1.h[5]
+ umlal2 v15.4s, v12.8h, v1.h[5]
+ umlal v14.4s, v13.4h, v1.h[5]
+ umlal2 v15.4s, v13.8h, v1.h[5]
+ 112: ext v12.16b, v5.16b, v6.16b, #3*2
+ ext v13.16b, v8.16b, v9.16b, #3*2
+ umlal v14.4s, v12.4h, v1.h[4]
+ umlal2 v15.4s, v12.8h, v1.h[4]
+ umlal v14.4s, v13.4h, v1.h[4]
+ umlal2 v15.4s, v13.8h, v1.h[4]
+ 111: ext v12.16b, v5.16b, v6.16b, #4*2
+ ext v13.16b, v8.16b, v9.16b, #2*2
+ umlal v14.4s, v12.4h, v1.h[3]
+ umlal2 v15.4s, v12.8h, v1.h[3]
+ umlal v14.4s, v13.4h, v1.h[3]
+ umlal2 v15.4s, v13.8h, v1.h[3]
+ 110: ext v12.16b, v5.16b, v6.16b, #5*2
+ ext v13.16b, v8.16b, v9.16b, #1*2
+ umlal v14.4s, v12.4h, v1.h[2]
+ umlal2 v15.4s, v12.8h, v1.h[2]
+ umlal v14.4s, v13.4h, v1.h[2]
+ umlal2 v15.4s, v13.8h, v1.h[2]
+ 109: ext v12.16b, v5.16b, v6.16b, #6*2
+ ext v13.16b, v8.16b, v9.16b, #0*2
+ umlal v14.4s, v12.4h, v1.h[1]
+ umlal2 v15.4s, v12.8h, v1.h[1]
+ umlal v14.4s, v13.4h, v1.h[1]
+ umlal2 v15.4s, v13.8h, v1.h[1]
+ 108: ext v12.16b, v5.16b, v6.16b, #7*2
+ ext v13.16b, v7.16b, v8.16b, #7*2
+ umlal v14.4s, v12.4h, v1.h[0]
+ umlal2 v15.4s, v12.8h, v1.h[0]
+ umlal v14.4s, v13.4h, v1.h[0]
+ umlal2 v15.4s, v13.8h, v1.h[0]
+ 107: ext v12.16b, v6.16b, v7.16b, #0*2
+ ext v13.16b, v7.16b, v8.16b, #6*2
+ umlal v14.4s, v12.4h, v0.h[7]
+ umlal2 v15.4s, v12.8h, v0.h[7]
+ umlal v14.4s, v13.4h, v0.h[7]
+ umlal2 v15.4s, v13.8h, v0.h[7]
+ 106: ext v12.16b, v6.16b, v7.16b, #1*2
+ ext v13.16b, v7.16b, v8.16b, #5*2
+ umlal v14.4s, v12.4h, v0.h[6]
+ umlal2 v15.4s, v12.8h, v0.h[6]
+ umlal v14.4s, v13.4h, v0.h[6]
+ umlal2 v15.4s, v13.8h, v0.h[6]
+ 105: ext v12.16b, v6.16b, v7.16b, #2*2
+ ext v13.16b, v7.16b, v8.16b, #4*2
+ umlal v14.4s, v12.4h, v0.h[5]
+ umlal2 v15.4s, v12.8h, v0.h[5]
+ umlal v14.4s, v13.4h, v0.h[5]
+ umlal2 v15.4s, v13.8h, v0.h[5]
+ 104: ext v12.16b, v6.16b, v7.16b, #3*2
+ ext v13.16b, v7.16b, v8.16b, #3*2
+ umlal v14.4s, v12.4h, v0.h[4]
+ umlal2 v15.4s, v12.8h, v0.h[4]
+ umlal v14.4s, v13.4h, v0.h[4]
+ umlal2 v15.4s, v13.8h, v0.h[4]
+ 103: ext v12.16b, v6.16b, v7.16b, #4*2
+ ext v13.16b, v7.16b, v8.16b, #2*2
+ umlal v14.4s, v12.4h, v0.h[3]
+ umlal2 v15.4s, v12.8h, v0.h[3]
+ umlal v14.4s, v13.4h, v0.h[3]
+ umlal2 v15.4s, v13.8h, v0.h[3]
+ 102: ext v12.16b, v6.16b, v7.16b, #5*2
+ ext v13.16b, v7.16b, v8.16b, #1*2
+ umlal v14.4s, v12.4h, v0.h[2]
+ umlal2 v15.4s, v12.8h, v0.h[2]
+ umlal v14.4s, v13.4h, v0.h[2]
+ umlal2 v15.4s, v13.8h, v0.h[2]
+ 101: ext v12.16b, v6.16b, v7.16b, #6*2
+ ext v13.16b, v7.16b, v8.16b, #0*2
+ umlal v14.4s, v12.4h, v0.h[1]
+ umlal2 v15.4s, v12.8h, v0.h[1]
+ umlal v14.4s, v13.4h, v0.h[1]
+ umlal2 v15.4s, v13.8h, v0.h[1]
+
+ uqrshrn v14.4h, v14.4s, #16
+ uqrshrn2 v14.8h, v15.4s, #16
+ uqrshrn v15.8b, v14.8h, #FRACTION_BITS
+
+ mov v31.16b, v4.16b
+ mov v4.16b, v5.16b
+ mov v5.16b, v6.16b
+ mov v6.16b, v7.16b
+ mov v7.16b, v8.16b
+ mov v8.16b, v9.16b
+ mov v9.16b, v10.16b
+ mov v10.16b, v11.16b
+.endm/*}}}*/
+
+#define TUNED_LIST4 6, 12, 20
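+/* As above, but for the four-channel (RGBA) horizontal convolvers: tuned
+ * variants are provided for radii up to 6, 12 and 20, with hconv4_25 as the
+ * generic fallback.
+ */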
+.macro hconv4_6/*{{{*/
+.rodata
+ 200: .hword -4
+ .hword 101f-100f
+ .hword 102f-100f
+ .hword 103f-100f
+ .hword 104f-100f
+ .hword 105f-100f
+ .hword 106f-100f
+ .align 4
+.text
+ umull v14.4s, v7.4h, v0.h[0]
+ umull2 v15.4s, v7.8h, v0.h[0]
+
+ adrp x16, 200b
+ add x16, x16, :lo12:200b
+ ldrsh x12, [x16, x5, LSL #1]
+ adr x16, 100f
+ add x12, x12, x16
+ 100: br x12
+ 106: umlal v14.4s, v4.4h, v0.h[6]
+ umlal2 v15.4s, v4.8h, v0.h[6]
+ umlal v14.4s, v10.4h, v0.h[6]
+ umlal2 v15.4s, v10.8h, v0.h[6]
+ 105: umlal2 v14.4s, v4.8h, v0.h[5]
+ umlal v15.4s, v5.4h, v0.h[5]
+ umlal2 v14.4s, v9.8h, v0.h[5]
+ umlal v15.4s, v10.4h, v0.h[5]
+ 104: umlal v14.4s, v5.4h, v0.h[4]
+ umlal2 v15.4s, v5.8h, v0.h[4]
+ umlal v14.4s, v9.4h, v0.h[4]
+ umlal2 v15.4s, v9.8h, v0.h[4]
+ 103: umlal2 v14.4s, v5.8h, v0.h[3]
+ umlal v15.4s, v6.4h, v0.h[3]
+ umlal2 v14.4s, v8.8h, v0.h[3]
+ umlal v15.4s, v9.4h, v0.h[3]
+ 102: umlal v14.4s, v6.4h, v0.h[2]
+ umlal2 v15.4s, v6.8h, v0.h[2]
+ umlal v14.4s, v8.4h, v0.h[2]
+ umlal2 v15.4s, v8.8h, v0.h[2]
+ 101: umlal2 v14.4s, v6.8h, v0.h[1]
+ umlal v15.4s, v7.4h, v0.h[1]
+ umlal2 v14.4s, v7.8h, v0.h[1]
+ umlal v15.4s, v8.4h, v0.h[1]
+
+ uqrshrn v14.4h, v14.4s, #16
+ uqrshrn2 v14.8h, v15.4s, #16
+ uqrshrn v15.8b, v14.8h, #FRACTION_BITS
+
+ mov v4.16b, v5.16b
+ mov v5.16b, v6.16b
+ mov v6.16b, v7.16b
+ mov v7.16b, v8.16b
+ mov v8.16b, v9.16b
+ mov v9.16b, v10.16b
+ mov v10.16b, v11.16b
+.endm/*}}}*/
+
+.macro hconv4_12/*{{{*/
+.rodata
+ 200: .hword -4 //Might need to remove these...
+ .hword 101f-100f
+ .hword 102f-100f
+ .hword 103f-100f
+ .hword 104f-100f
+ .hword 105f-100f
+ .hword 106f-100f
+ .hword 107f-100f
+ .hword 108f-100f
+ .hword 109f-100f
+ .hword 110f-100f
+ .hword 111f-100f
+ .hword 112f-100f
+ .align 4
+.text
+ umull v14.4s, v4.4h, v0.h[0]
+ umull2 v15.4s, v4.8h, v0.h[0]
+
+ adrp x16, 200b
+ add x16, x16, :lo12:200b
+ ldrsh x12, [x16, x5, LSL #1]
+ adr x16, 100f
+ add x12, x12, x16
+ 100: br x12
+ 112: umlal v14.4s, v26.4h, v1.h[4]
+ umlal2 v15.4s, v26.8h, v1.h[4]
+ umlal v14.4s, v10.4h, v1.h[4]
+ umlal2 v15.4s, v10.8h, v1.h[4]
+ 111: umlal2 v14.4s, v26.8h, v1.h[3]
+ umlal v15.4s, v27.4h, v1.h[3]
+ umlal2 v14.4s, v9.8h, v1.h[3]
+ umlal v15.4s, v10.4h, v1.h[3]
+ 110: umlal v14.4s, v27.4h, v1.h[2]
+ umlal2 v15.4s, v27.8h, v1.h[2]
+ umlal v14.4s, v9.4h, v1.h[2]
+ umlal2 v15.4s, v9.8h, v1.h[2]
+ 109: umlal2 v14.4s, v27.8h, v1.h[1]
+ umlal v15.4s, v28.4h, v1.h[1]
+ umlal2 v14.4s, v8.8h, v1.h[1]
+ umlal v15.4s, v9.4h, v1.h[1]
+ 108: umlal v14.4s, v28.4h, v1.h[0]
+ umlal2 v15.4s, v28.8h, v1.h[0]
+ umlal v14.4s, v8.4h, v1.h[0]
+ umlal2 v15.4s, v8.8h, v1.h[0]
+ 107: umlal2 v14.4s, v28.8h, v0.h[7]
+ umlal v15.4s, v29.4h, v0.h[7]
+ umlal2 v14.4s, v7.8h, v0.h[7]
+ umlal v15.4s, v8.4h, v0.h[7]
+ 106: umlal v14.4s, v29.4h, v0.h[6]
+ umlal2 v15.4s, v29.8h, v0.h[6]
+ umlal v14.4s, v7.4h, v0.h[6]
+ umlal2 v15.4s, v7.8h, v0.h[6]
+ 105: umlal2 v14.4s, v29.8h, v0.h[5]
+ umlal v15.4s, v30.4h, v0.h[5]
+ umlal2 v14.4s, v6.8h, v0.h[5]
+ umlal v15.4s, v7.4h, v0.h[5]
+ 104: umlal v14.4s, v30.4h, v0.h[4]
+ umlal2 v15.4s, v30.8h, v0.h[4]
+ umlal v14.4s, v6.4h, v0.h[4]
+ umlal2 v15.4s, v6.8h, v0.h[4]
+ 103: umlal2 v14.4s, v30.8h, v0.h[3]
+ umlal v15.4s, v31.4h, v0.h[3]
+ umlal2 v14.4s, v5.8h, v0.h[3]
+ umlal v15.4s, v6.4h, v0.h[3]
+ 102: umlal v14.4s, v31.4h, v0.h[2]
+ umlal2 v15.4s, v31.8h, v0.h[2]
+ umlal v14.4s, v5.4h, v0.h[2]
+ umlal2 v15.4s, v5.8h, v0.h[2]
+ 101: umlal2 v14.4s, v31.8h, v0.h[1]
+ umlal v15.4s, v4.4h, v0.h[1]
+ umlal2 v14.4s, v4.8h, v0.h[1]
+ umlal v15.4s, v5.4h, v0.h[1]
+
+ uqrshrn v14.4h, v14.4s, #16
+ uqrshrn2 v14.8h, v15.4s, #16
+ uqrshrn v15.8b, v14.8h, #FRACTION_BITS
+
+ mov v26.16b, v27.16b
+ mov v27.16b, v28.16b
+ mov v28.16b, v29.16b
+ mov v29.16b, v30.16b
+ mov v30.16b, v31.16b
+ mov v31.16b, v4.16b
+ mov v4.16b, v5.16b
+ mov v5.16b, v6.16b
+ mov v6.16b, v7.16b
+ mov v7.16b, v8.16b
+ mov v8.16b, v9.16b
+ mov v9.16b, v10.16b
+ mov v10.16b, v11.16b
+.endm/*}}}*/
+
+.macro hconv4_20/*{{{*/
+.rodata
+ 200: .hword -4
+ .hword 101f-100f
+ .hword 102f-100f
+ .hword 103f-100f
+ .hword 104f-100f
+ .hword 105f-100f
+ .hword 106f-100f
+ .hword 107f-100f
+ .hword 108f-100f
+ .hword 109f-100f
+ .hword 110f-100f
+ .hword 111f-100f
+ .hword 112f-100f
+ .hword 113f-100f
+ .hword 114f-100f
+ .hword 115f-100f
+ .hword 116f-100f
+ .hword 117f-100f
+ .hword 118f-100f
+ .hword 119f-100f
+ .hword 120f-100f
+ .align 4
+.text
+ umull v14.4s, v28.4h, v0.h[0]
+ umull2 v15.4s, v28.8h, v0.h[0]
+
+ adrp x16, 200b
+ add x16, x16, :lo12:200b
+ ldrsh x12, [x16, x5, LSL #1]
+ adr x16, 100f
+ add x12, x12, x16
+ 100: br x12
+ 120: umlal v14.4s, v18.4h, v2.h[4]
+ umlal2 v15.4s, v18.8h, v2.h[4]
+ umlal v14.4s, v10.4h, v2.h[4]
+ umlal2 v15.4s, v10.8h, v2.h[4]
+ 119: umlal2 v14.4s, v18.8h, v2.h[3]
+ umlal v15.4s, v19.4h, v2.h[3]
+ umlal2 v14.4s, v9.8h, v2.h[3]
+ umlal v15.4s, v10.4h, v2.h[3]
+ 118: umlal v14.4s, v19.4h, v2.h[2]
+ umlal2 v15.4s, v19.8h, v2.h[2]
+ umlal v14.4s, v9.4h, v2.h[2]
+ umlal2 v15.4s, v9.8h, v2.h[2]
+ 117: umlal2 v14.4s, v19.8h, v2.h[1]
+ umlal v15.4s, v20.4h, v2.h[1]
+ umlal2 v14.4s, v8.8h, v2.h[1]
+ umlal v15.4s, v9.4h, v2.h[1]
+ 116: umlal v14.4s, v20.4h, v2.h[0]
+ umlal2 v15.4s, v20.8h, v2.h[0]
+ umlal v14.4s, v8.4h, v2.h[0]
+ umlal2 v15.4s, v8.8h, v2.h[0]
+ 115: umlal2 v14.4s, v20.8h, v1.h[7]
+ umlal v15.4s, v21.4h, v1.h[7]
+ umlal2 v14.4s, v7.8h, v1.h[7]
+ umlal v15.4s, v8.4h, v1.h[7]
+ 114: umlal v14.4s, v21.4h, v1.h[6]
+ umlal2 v15.4s, v21.8h, v1.h[6]
+ umlal v14.4s, v7.4h, v1.h[6]
+ umlal2 v15.4s, v7.8h, v1.h[6]
+ 113: umlal2 v14.4s, v21.8h, v1.h[5]
+ umlal v15.4s, v22.4h, v1.h[5]
+ umlal2 v14.4s, v6.8h, v1.h[5]
+ umlal v15.4s, v7.4h, v1.h[5]
+ 112: umlal v14.4s, v22.4h, v1.h[4]
+ umlal2 v15.4s, v22.8h, v1.h[4]
+ umlal v14.4s, v6.4h, v1.h[4]
+ umlal2 v15.4s, v6.8h, v1.h[4]
+ 111: umlal2 v14.4s, v22.8h, v1.h[3]
+ umlal v15.4s, v23.4h, v1.h[3]
+ umlal2 v14.4s, v5.8h, v1.h[3]
+ umlal v15.4s, v6.4h, v1.h[3]
+ 110: umlal v14.4s, v23.4h, v1.h[2]
+ umlal2 v15.4s, v23.8h, v1.h[2]
+ umlal v14.4s, v5.4h, v1.h[2]
+ umlal2 v15.4s, v5.8h, v1.h[2]
+ 109: umlal2 v14.4s, v23.8h, v1.h[1]
+ umlal v15.4s, v24.4h, v1.h[1]
+ umlal2 v14.4s, v4.8h, v1.h[1]
+ umlal v15.4s, v5.4h, v1.h[1]
+ 108: umlal v14.4s, v24.4h, v1.h[0]
+ umlal2 v15.4s, v24.8h, v1.h[0]
+ umlal v14.4s, v4.4h, v1.h[0]
+ umlal2 v15.4s, v4.8h, v1.h[0]
+ 107: umlal2 v14.4s, v24.8h, v0.h[7]
+ umlal v15.4s, v25.4h, v0.h[7]
+ umlal2 v14.4s, v31.8h, v0.h[7]
+ umlal v15.4s, v4.4h, v0.h[7]
+ 106: umlal v14.4s, v25.4h, v0.h[6]
+ umlal2 v15.4s, v25.8h, v0.h[6]
+ umlal v14.4s, v31.4h, v0.h[6]
+ umlal2 v15.4s, v31.8h, v0.h[6]
+ 105: umlal2 v14.4s, v25.8h, v0.h[5]
+ umlal v15.4s, v26.4h, v0.h[5]
+ umlal2 v14.4s, v30.8h, v0.h[5]
+ umlal v15.4s, v31.4h, v0.h[5]
+ 104: umlal v14.4s, v26.4h, v0.h[4]
+ umlal2 v15.4s, v26.8h, v0.h[4]
+ umlal v14.4s, v30.4h, v0.h[4]
+ umlal2 v15.4s, v30.8h, v0.h[4]
+ 103: umlal2 v14.4s, v26.8h, v0.h[3]
+ umlal v15.4s, v27.4h, v0.h[3]
+ umlal2 v14.4s, v29.8h, v0.h[3]
+ umlal v15.4s, v30.4h, v0.h[3]
+ 102: umlal v14.4s, v27.4h, v0.h[2]
+ umlal2 v15.4s, v27.8h, v0.h[2]
+ umlal v14.4s, v29.4h, v0.h[2]
+ umlal2 v15.4s, v29.8h, v0.h[2]
+ 101: umlal2 v14.4s, v27.8h, v0.h[1]
+ umlal v15.4s, v28.4h, v0.h[1]
+ umlal2 v14.4s, v28.8h, v0.h[1]
+ umlal v15.4s, v29.4h, v0.h[1]
+
+ uqrshrn v14.4h, v14.4s, #16
+ uqrshrn2 v14.8h, v15.4s, #16
+ uqrshrn v15.8b, v14.8h, #FRACTION_BITS
+
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ mov v21.16b, v22.16b
+ mov v22.16b, v23.16b
+ mov v23.16b, v24.16b
+ mov v24.16b, v25.16b
+ mov v25.16b, v26.16b
+ mov v26.16b, v27.16b
+ mov v27.16b, v28.16b
+ mov v28.16b, v29.16b
+ mov v29.16b, v30.16b
+ mov v30.16b, v31.16b
+ mov v31.16b, v4.16b
+ mov v4.16b, v5.16b
+ mov v5.16b, v6.16b
+ mov v6.16b, v7.16b
+ mov v7.16b, v8.16b
+ mov v8.16b, v9.16b
+ mov v9.16b, v10.16b
+ mov v10.16b, v11.16b
+.endm/*}}}*/
+
+.macro hconv4_25/*{{{*/
+.rodata
+ 200: .hword -4
+ .hword 101f-100f
+ .hword 102f-100f
+ .hword 103f-100f
+ .hword 104f-100f
+ .hword 105f-100f
+ .hword 106f-100f
+ .hword 107f-100f
+ .hword 108f-100f
+ .hword 109f-100f
+ .hword 110f-100f
+ .hword 111f-100f
+ .hword 112f-100f
+ .hword 113f-100f
+ .hword 114f-100f
+ .hword 115f-100f
+ .hword 116f-100f
+ .hword 117f-100f
+ .hword 118f-100f
+ .hword 119f-100f
+ .hword 120f-100f
+ .hword 121f-100f
+ .hword 122f-100f
+ .hword 123f-100f
+ .hword 124f-100f
+ .hword 125f-100f
+ .align 4
+.text
+ umull2 v14.4s, v25.8h, v0.h[0]
+ umull v15.4s, v26.4h, v0.h[0]
+
+ adrp x16, 200b
+ add x16, x16, :lo12:200b
+ ldrsh x12, [x16, x5, LSL #1]
+ adr x16, 100f
+ add x12, x12, x16
+ 100: br x12
+ 125: ld1 {v12.8h}, [x9]
+ umlal v14.4s, v12.4h, v3.h[1]
+ umlal2 v15.4s, v12.8h, v3.h[1]
+ umlal v14.4s, v10.4h, v3.h[1]
+ umlal2 v15.4s, v10.8h, v3.h[1]
+ 124: add x12, x9, #0x08
+ bic x12, x12, #0x40
+ ld1 {v12.4h}, [x12], #8
+ bic x12, x12, #0x40
+ ld1 {v13.4h}, [x12]
+ umlal v14.4s, v12.4h, v3.h[0]
+ umlal v15.4s, v13.4h, v3.h[0]
+ umlal2 v14.4s, v9.8h, v3.h[0]
+ umlal v15.4s, v10.4h, v3.h[0]
+ 123: add x12, x9, #0x10
+ bic x12, x12, #0x40
+ ld1 {v12.8h}, [x12]
+ umlal v14.4s, v12.4h, v2.h[7]
+ umlal2 v15.4s, v12.8h, v2.h[7]
+ umlal v14.4s, v9.4h, v2.h[7]
+ umlal2 v15.4s, v9.8h, v2.h[7]
+ 122: add x12, x9, #0x18
+ bic x12, x12, #0x40
+ ld1 {v12.4h}, [x12], #8
+ bic x12, x12, #0x40
+ ld1 {v13.4h}, [x12]
+ umlal v14.4s, v12.4h, v2.h[6]
+ umlal v15.4s, v13.4h, v2.h[6]
+ umlal2 v14.4s, v8.8h, v2.h[6]
+ umlal v15.4s, v9.4h, v2.h[6]
+ 121: add x12, x9, #0x20
+ bic x12, x12, #0x40
+ ld1 {v12.8h}, [x12]
+ umlal v14.4s, v12.4h, v2.h[5]
+ umlal2 v15.4s, v12.8h, v2.h[5]
+ umlal v14.4s, v8.4h, v2.h[5]
+ umlal2 v15.4s, v8.8h, v2.h[5]
+ 120: add x12, x9, #0x28
+ bic x12, x12, #0x40
+ ld1 {v12.4h}, [x12], #8
+ bic x12, x12, #0x40
+ ld1 {v13.4h}, [x12]
+ umlal v14.4s, v12.4h, v2.h[4]
+ umlal v15.4s, v13.4h, v2.h[4]
+ umlal2 v14.4s, v7.8h, v2.h[4]
+ umlal v15.4s, v8.4h, v2.h[4]
+ 119: add x12, x9, #0x30
+ bic x12, x12, #0x40
+ ld1 {v12.8h}, [x12]
+ umlal v14.4s, v12.4h, v2.h[3]
+ umlal2 v15.4s, v12.8h, v2.h[3]
+ umlal v14.4s, v7.4h, v2.h[3]
+ umlal2 v15.4s, v7.8h, v2.h[3]
+ 118: add x12, x9, #0x38
+ bic x12, x12, #0x40
+ ld1 {v12.4h}, [x12]
+ umlal v14.4s, v12.4h, v2.h[2]
+ umlal v15.4s, v17.4h, v2.h[2]
+ umlal2 v14.4s, v6.8h, v2.h[2]
+ umlal v15.4s, v7.4h, v2.h[2]
+ 117: umlal v14.4s, v17.4h, v2.h[1]
+ umlal2 v15.4s, v17.8h, v2.h[1]
+ umlal v14.4s, v6.4h, v2.h[1]
+ umlal2 v15.4s, v6.8h, v2.h[1]
+ 116: umlal2 v14.4s, v17.8h, v2.h[0]
+ umlal v15.4s, v18.4h, v2.h[0]
+ umlal2 v14.4s, v5.8h, v2.h[0]
+ umlal v15.4s, v6.4h, v2.h[0]
+ 115: umlal v14.4s, v18.4h, v1.h[7]
+ umlal2 v15.4s, v18.8h, v1.h[7]
+ umlal v14.4s, v5.4h, v1.h[7]
+ umlal2 v15.4s, v5.8h, v1.h[7]
+ 114: umlal2 v14.4s, v18.8h, v1.h[6]
+ umlal v15.4s, v19.4h, v1.h[6]
+ umlal2 v14.4s, v4.8h, v1.h[6]
+ umlal v15.4s, v5.4h, v1.h[6]
+ 113: umlal v14.4s, v19.4h, v1.h[5]
+ umlal2 v15.4s, v19.8h, v1.h[5]
+ umlal v14.4s, v4.4h, v1.h[5]
+ umlal2 v15.4s, v4.8h, v1.h[5]
+ 112: umlal2 v14.4s, v19.8h, v1.h[4]
+ umlal v15.4s, v20.4h, v1.h[4]
+ umlal2 v14.4s, v31.8h, v1.h[4]
+ umlal v15.4s, v4.4h, v1.h[4]
+ 111: umlal v14.4s, v20.4h, v1.h[3]
+ umlal2 v15.4s, v20.8h, v1.h[3]
+ umlal v14.4s, v31.4h, v1.h[3]
+ umlal2 v15.4s, v31.8h, v1.h[3]
+ 110: umlal2 v14.4s, v20.8h, v1.h[2]
+ umlal v15.4s, v21.4h, v1.h[2]
+ umlal2 v14.4s, v30.8h, v1.h[2]
+ umlal v15.4s, v31.4h, v1.h[2]
+ 109: umlal v14.4s, v21.4h, v1.h[1]
+ umlal2 v15.4s, v21.8h, v1.h[1]
+ umlal v14.4s, v30.4h, v1.h[1]
+ umlal2 v15.4s, v30.8h, v1.h[1]
+ 108: umlal2 v14.4s, v21.8h, v1.h[0]
+ umlal v15.4s, v22.4h, v1.h[0]
+ umlal2 v14.4s, v29.8h, v1.h[0]
+ umlal v15.4s, v30.4h, v1.h[0]
+ 107: umlal v14.4s, v22.4h, v0.h[7]
+ umlal2 v15.4s, v22.8h, v0.h[7]
+ umlal v14.4s, v29.4h, v0.h[7]
+ umlal2 v15.4s, v29.8h, v0.h[7]
+ 106: umlal2 v14.4s, v22.8h, v0.h[6]
+ umlal v15.4s, v23.4h, v0.h[6]
+ umlal2 v14.4s, v28.8h, v0.h[6]
+ umlal v15.4s, v29.4h, v0.h[6]
+ 105: umlal v14.4s, v23.4h, v0.h[5]
+ umlal2 v15.4s, v23.8h, v0.h[5]
+ umlal v14.4s, v28.4h, v0.h[5]
+ umlal2 v15.4s, v28.8h, v0.h[5]
+ 104: umlal2 v14.4s, v23.8h, v0.h[4]
+ umlal v15.4s, v24.4h, v0.h[4]
+ umlal2 v14.4s, v27.8h, v0.h[4]
+ umlal v15.4s, v28.4h, v0.h[4]
+ 103: umlal v14.4s, v24.4h, v0.h[3]
+ umlal2 v15.4s, v24.8h, v0.h[3]
+ umlal v14.4s, v27.4h, v0.h[3]
+ umlal2 v15.4s, v27.8h, v0.h[3]
+ 102: umlal2 v14.4s, v24.8h, v0.h[2]
+ umlal v15.4s, v25.4h, v0.h[2]
+ umlal2 v14.4s, v26.8h, v0.h[2]
+ umlal v15.4s, v27.4h, v0.h[2]
+ 101: umlal v14.4s, v25.4h, v0.h[1]
+ umlal2 v15.4s, v25.8h, v0.h[1]
+ umlal v14.4s, v26.4h, v0.h[1]
+ umlal2 v15.4s, v26.8h, v0.h[1]
+
+ uqrshrn v14.4h, v14.4s, #16
+ uqrshrn2 v14.8h, v15.4s, #16
+ uqrshrn v15.8b, v14.8h, #FRACTION_BITS
+
+ st1 {v17.16b}, [x9], #16
+ bic x9, x9, #0x40
+ mov v17.16b, v18.16b
+ mov v18.16b, v19.16b
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ mov v21.16b, v22.16b
+ mov v22.16b, v23.16b
+ mov v23.16b, v24.16b
+ mov v24.16b, v25.16b
+ mov v25.16b, v26.16b
+ mov v26.16b, v27.16b
+ mov v27.16b, v28.16b
+ mov v28.16b, v29.16b
+ mov v29.16b, v30.16b
+ mov v30.16b, v31.16b
+ mov v31.16b, v4.16b
+ mov v4.16b, v5.16b
+ mov v5.16b, v6.16b
+ mov v6.16b, v7.16b
+ mov v7.16b, v8.16b
+ mov v8.16b, v9.16b
+ mov v9.16b, v10.16b
+ mov v10.16b, v11.16b
+.endm/*}}}*/
+
+/* Dedicated function wrapper for the fetch macro, for the cases where
+ * performance isn't that important, to keep code size down.
+ */
+PRIVATE(fetch_generic_asm)
+ stp x10, x11, [sp, #-16]!
+ fetch
+ ldp x10, x11, [sp], #16
+ ret
+END(fetch_generic_asm)
+
+
+/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory
+ * beyond that limit, and filling the rest of the vector with the last legal
+ * pixel.
+ * Result is in v10 and v11. v8 and v9 are filled with the first legal pixel.
+ * Note: This function can read beyond the right edge of input if the image is
+ * narrower than 16 bytes.
+ */
+PRIVATE(fetch_clampleft1)
+ stp x29, x30, [sp, #-16]!
+ bl fetch_generic_asm
+ dup v8.8h, v10.h[0]
+ dup v9.8h, v10.h[0]
+ ands x12, x10, #15
+ beq 1f
+ sub x1, x1, x12
+ sub x15, x15, x12
+ sub x19, x19, x12
+ sub x10, x10, x12
+ sub x12, sp, x12, LSL #1
+ sub sp, sp, #64
+ sub x12, x12, #32
+ st1 {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
+ ld1 {v10.8h,v11.8h}, [x12]
+ add sp, sp, #64
+1: ldp x29, x30, [sp], #16
+ ret
+END(fetch_clampleft1)
+
+PRIVATE(fetch_clampleft4)
+ stp x29, x30, [sp, #-16]!
+ bl fetch_generic_asm
+ dup v8.2d, v10.d[0]
+ dup v9.2d, v10.d[0]
+ ands x12, x10, #15
+ beq 1f
+ sub x1, x1, x12
+ sub x15, x15, x12
+ sub x19, x19, x12
+ sub x10, x10, x12
+ sub x12, sp, x12, LSL #1
+ sub sp, sp, #64
+ sub x12, x12, #32
+ st1 {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
+ ld1 {v10.8h,v11.8h}, [x12]
+ add sp, sp, #64
+1: ldp x29, x30, [sp], #16
+ ret
+END(fetch_clampleft4)
+
+/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding
+ * reading memory beyond that limit, and filling the rest of the vector with
+ * the last legal pixel.
+ * Result is in v10 and v11. v12 and v13 are filled with the last legal pixel.
+ * Note: This function can read beyond the left edge of input if the image is
+ * narrower than 16 bytes.
+ */
+PRIVATE(fetch_clampright1)
+ stp x29, x30, [sp, #-16]!
+ sub x12, xzr, x11
+ ands x12, x12, #15
+ beq 1f
+ sub x1, x1, x12
+ sub x15, x15, x12
+ sub x19, x19, x12
+ bl fetch_generic_asm
+ dup v12.8h, v11.h[7]
+ dup v13.8h, v11.h[7]
+ sub x12, xzr, x11
+ and x12, x12, #15
+ sub sp, sp, #64
+ add x12, sp, x12, LSL #1
+ st1 {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
+ ld1 {v10.8h,v11.8h}, [x12]
+ add sp, sp, #64
+ ldp x29, x30, [sp], #16
+ ret
+1: bl fetch_generic_asm
+ dup v12.8h, v11.h[7]
+ dup v13.8h, v11.h[7]
+ ldp x29, x30, [sp], #16
+ ret
+END(fetch_clampright1)
+
+PRIVATE(fetch_clampright4)
+ stp x29, x30, [sp, #-16]!
+ sub x12, xzr, x11
+ ands x12, x12, #15
+ beq 1f
+ sub x1, x1, x12
+ sub x15, x15, x12
+ sub x19, x19, x12
+ bl fetch_generic_asm
+ dup v12.2d, v11.d[1]
+ dup v13.2d, v11.d[1]
+ sub x12, xzr, x11
+ and x12, x12, #15
+ sub sp, sp, #64
+ add x12, sp, x12, LSL #1
+ st1 {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
+ ld1 {v10.8h,v11.8h}, [x12]
+ add sp, sp, #64
+ ldp x29, x30, [sp], #16
+ ret
+1: bl fetch_generic_asm
+ dup v12.2d, v11.d[1]
+ dup v13.2d, v11.d[1]
+ ldp x29, x30, [sp], #16
+ ret
+END(fetch_clampright4)
+
+/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
+ * value across to fill the rest of the register pair. Used for filling the
+ * right hand edge of the window when reading too close to the right hand edge
+ * of the image.
+ * Also returns a dup-ed copy of the last element in v12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
+ */
+PRIVATE(prefill_sweepright1)
+ ands x12, x11, #15
+ beq 1f
+ sub x12, x12, #1
+ sub sp, sp, #64
+ st1 {v10.8h,v11.8h}, [sp]
+ add x12, sp, x12, LSL #1
+ ld1r {v12.8h}, [x12]
+ ld1r {v13.8h}, [x12]
+ st1 {v12.8h,v13.8h}, [x12]
+ ld1 {v10.8h,v11.8h}, [sp]
+ add sp, sp, #64
+ ret
+1: dup v12.8h, v11.h[7]
+ dup v13.8h, v11.h[7]
+ ret
+END(prefill_sweepright1)
+
+PRIVATE(prefill_sweepright4)
+ ands x12, x11, #15
+ beq 1f
+ sub x12, x12, #4
+ sub sp, sp, #64
+ st1 {v10.8h,v11.8h}, [sp]
+ add x12, sp, x12, LSL #1
+ ld1r {v12.2d}, [x12]
+ st1 {v13.8h}, [x12]
+ ld1 {v10.8h,v11.8h}, [sp]
+ add sp, sp, #64
+ ret
+1: dup v12.2d, v11.d[1]
+ dup v13.2d, v11.d[1]
+ ret
+END(prefill_sweepright4)
+
+/* The main loop keeps a sliding window of data that has already been convolved
+ * in the vertical axis for the current line. This usually stays in the
+ * register file, but spills to memory for large windows. The first thing that
+ * needs to be done at start-up is to fill this window with image data, taking
+ * into account the padding needed if the left or right edges of the image fall
+ * within this window.
+ */
+
+/* Because the window is in the register file writes to it cannot be indexed
+ * by another register. Consequently the fill loops are unrolled to address
+ * the registers directly. This macro distinguishes between writes to the
+ * register file and writes to the spill buffer (indicated by a destination
+ * register named xx).
+ */
+.macro prefill_out ra, rb, sra, srb
+ .ifc \ra,xx
+ .ifc \rb,xx
+ st1 {\sra,\srb}, [x9], #32
+ .else
+ bic x9, x9, #0x40
+ st1 {\sra}, [x9], #16
+ mov \rb, \srb
+ .endif
+ .else
+ .ifnc \ra,\sra
+ mov \ra, \sra
+ .endif
+ .ifnc \rb,\srb
+ mov \rb, \srb
+ .endif
+ .endif
+.endm
+
+/* This macro provides the list of registers representing the window, and the
+ * cases where the register file is too small and a spill buffer is used
+ * instead.
+ * Since several specialisations of each function are generated, this also
+ * culls superfluous iterations, and sets the variable `i` for subsequent
+ * macros indicating the current index into the window.
+ */
+.macro prefill_list, macro, nextmacro, max_r, step, label
+ .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
+ .if windowsize >= (\line * 16)
+ .set i, windowsize - (\line * 16)
+\label\macro\line:
+ prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
+ .endif
+ .endm
+ ifneeded \macro \nextmacro, 13, 12, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 12, 11, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 11, 10, xx, v17.16b, \step, \label
+ ifneeded \macro \nextmacro, 10, 9, v18.16b, v19.16b, \step, \label
+ ifneeded \macro \nextmacro, 9, 8, v20.16b, v21.16b, \step, \label
+ ifneeded \macro \nextmacro, 8, 7, v22.16b, v23.16b, \step, \label
+ ifneeded \macro \nextmacro, 7, 6, v24.16b, v25.16b, \step, \label
+ ifneeded \macro \nextmacro, 6, 5, v26.16b, v27.16b, \step, \label
+ ifneeded \macro \nextmacro, 5, 4, v28.16b, v29.16b, \step, \label
+ ifneeded \macro \nextmacro, 4, 3, v30.16b, v31.16b, \step, \label
+ ifneeded \macro \nextmacro, 3, 2, v4.16b, v5.16b, \step, \label
+ ifneeded \macro \nextmacro, 2, 1, v6.16b, v7.16b, \step, \label
+ ifneeded \macro \nextmacro, 1, 0, v8.16b, v9.16b, \step, \label
+\label\macro\()0:
+ b \label\()_end
+ .purgem ifneeded
+.endm
+
+/* These macros represent the possible stages of filling the window.
+ * Each macro is unrolled enough times that it can fill the entire window
+ * itself, but normally it will have to hand control to subsequent macros
+ * part-way through and this is done using labels named \next and \after, where
+ * \next is the next macro starting at the same window position and \after is
+ * the next macro starting after the current window position.
+ */
+
+/* leftfill: v8 and v9 contain the left padding value. While the window
+ * extends outside of the image on the left-hand side, and at least 16 more
+ * padding values are needed in the window, store v8 and v9 into the window.
+ * Otherwise skip forward to storing image data.
+ */
+.macro prefill_leftfill, next, after, ra, rb, step
+ cmp x10, #i+16
+ blo \next
+ prefill_out \ra, \rb, v8.16b, v9.16b
+.endm
+
+/* leftedge: The very first non-fill or partial-fill chunk from the image is
+ * already loaded (as it was used to calculate the left padding value), so
+ * store it here, and then drop into the regular load/store cycle in the next
+ * macro.
+ */
+.macro prefill_leftedge, next, after, ra, rb, step
+1: prefill_out \ra, \rb, v10.16b, v11.16b
+ b \after
+.endm
+
+/* dofetch: Copy chunks of the image into the window without any complications
+ * from edge conditions.
+ */
+.macro prefill_dofetch, next, after, ra, rb, step
+ cmp x11, #i+16
+ bls \next
+ bl fetch_generic_asm
+ prefill_out \ra, \rb, v10.16b, v11.16b
+.endm
+
+/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
+ * the right-hand edge of the image. In that case sweep the last valid pixel
+ * across the rest of the chunk, and in either case prepare padding data in v12
+ * and v13 for the next macro. This is done in fetch_clampright.
+ * This only happens once before going on to the next macro.
+ * Sometimes leftedge also covers the rightedge case, in which case this has
+ * to be skipped altogether.
+ */
+.macro prefill_rightedge, next, after, ra, rb, step
+ cmp x11, #i
+ bls \next
+ bl fetch_clampright\step
+ prefill_out \ra, \rb, v10.16b, v11.16b
+ b \after
+.endm
+
+/* rightfill: The rest of the window is simply filled with right padding from
+ * v12 and v13.
+ */
+.macro prefill_rightfill, next, after, ra, rb, step
+ prefill_out \ra, \rb, v12.16b, v13.16b
+.endm
+
+/* Here all of the macros above are unrolled and laid out in the proper order.
+ */
+.macro prefill_body, max_r, step, label
+ prefill_list leftfill, leftedge, \max_r, \step, \label
+ prefill_list leftedge, dofetch, \max_r, \step, \label
+ prefill_list dofetch, rightedge, \max_r, \step, \label
+ prefill_list rightedge, rightfill, \max_r, \step, \label
+ prefill_list rightfill, oops, \max_r, \step, \label
+\label\()_end:
+.endm
+
+
+/* Fill the convolution window with context data. The aim here is to load
+ * exactly 2*r columns, and in the main loop to read as many columns as will be
+ * written. This is complicated by the window being divided into chunks at
+ * register boundaries, and the need to handle cases when the input starts very
+ * close to the left or right (or both) edges of the image and the need to fill
+ * the spaces that leaves with left and right edge padding values.
+ *
+ * Input:
+ * x1 -- src
+ * x2 -- pitch
+ * x3 -- count
+ * x4 -- available image data right of src pointer
+ * x5 -- r
+ * x6 -- rup
+ * x7 -- rdn
+ * x8 -- available image data left of src pointer
+ * x9 -- buffer (if needed)
+ * x13 = -pitch
+ * x15 = top-row in
+ * x19 = bottom-row in
+ * Output:
+ * x4 -= min(inlen, count + windowsize - centertap)
+ * x1 += min(inlen, count + windowsize - centertap)
+ * x15 += min(inlen, count + windowsize - centertap)
+ * x19 += min(inlen, count + windowsize - centertap)
+ * Modifies:
+ * x10 -- fill start index in the window
+ * x11 -- fill stop index in the window
+ * x12 -- scratch
+ */
+.macro prefill step=1, max_r=25, label=xx
+.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
+.set centertap, (windowsize - \max_r * \step)
+ mov x10, #centertap
+ subs x10, x10, x8
+ csel x10, xzr, x10, lo
+
+ subs x11, x4, #windowsize - centertap
+ csel x11, xzr, x11, hs
+ add x11, x11, #windowsize
+
+ /* x10 indicates where in the window legal image data begins.
+     * x11 indicates where in the window legal image data ends.
+ * When starting near the centre of a large image these would be
+ * zero and windowsize respectively, but when starting near the
+ * edges this can change.
+ * When starting on the leftmost pixel, x10 will be centertap.
+ * When starting on the rightmost pixel, x11 will be centertap+1.
+ */
+
+ /* x4 indicates how much data there is between the current pointers
+ * and the right edge of the image. The pointers currently point
+ * to the data needed at centertap. The subsequent code will
+ * consume (windowsize - x10) data, but only the data from
+ * centertap to windowsize comes out of x4's budget.
+ */
+1: subs x4, x4, #windowsize - centertap
+ csel x4, xzr, x4, lo
+
+ /* And the pointers need to rewind to the start of the window.
+ */
+ sub x1, x1, #centertap
+ sub x15, x15, #centertap
+ sub x19, x19, #centertap
+
+ /* Unless x8 indicated that there wasn't that much data available.
+ */
+ add x1, x1, x10
+ add x15, x15, x10
+ add x19, x19, x10
+
+ /* Get the first chunk, and add padding to align it to the window
+ * if necessary.
+ */
+ bl fetch_clampleft\step
+
+ /* Sometimes the start and the end of the window are in the same
+ * chunk. In that case both ends need filler at the outset.
+ */
+ sub x12, x11, #1
+ eor x12, x10, x12
+ cmp x12, #16
+ bhs 1f
+ bl prefill_sweepright\step
+
+ /* Iterate through all the points in the window and fill them in
+ * with padding or image data as needed.
+ */
+1: prefill_body \max_r, \step, \label
+.endm
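+
+/* A rough C sketch of the index arithmetic at the top of the prefill macro
+ * above (illustrative only; `left` and `right` stand for the available data
+ * counts held in x8 and x4):
+ *
+ *   windowsize = (2 * max_r * step + 15) & ~15;
+ *   centertap  = windowsize - max_r * step;
+ *   x10 = centertap - min(left, centertap);      // first legal window index
+ *   x11 = min(windowsize, right + centertap);    // one past the last legal index
+ *   x4  = right - min(right, windowsize - centertap);
+ *   x1 += x10 - centertap;                       // rewind to the window start,
+ *                                                // clamped to the image edge
+ */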
+
+/* The main body of the convolve functions. Having already pre-filled the
+ * convolution window with 2*r input values, the logic settles into a regular
+ * pattern of reading and writing at a 1:1 rate until either input or output
+ * expires. The input leads the output by r values, so when processing all the
+ * way to the right-hand edge, or within r pixels of that edge, the input will
+ * run out first. In the case of very narrow images, or sub-windows starting
+ * near the right edge, the input may already have run out while the
+ * convolution window was being filled and this loop will start with a
+ * zero-length input.
+ *
+ * Once the input runs out, the rest of the output must be processed by padding
+ * the remainder of the window with pad value from the last valid pixel from
+ * the source.
+ *
+ * Input:
+ * x0 = dst
+ * x1 = src
+ * x2 = pitch
+ * x3 = count
+ * x4 = inlen
+ * x5 = r
+ * x6 = rup
+ * x7 = rdn
+ * x9 = buffer
+ * x13 = -pitch
+ * x15 = top-row in
+ * x19 = bottom-row in
+ * Modifies
+ * x8 = fetch code pointer
+ */
+.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
+
+ /* If x4 >= x3 then there's no need for clipping. The main loop
+ * needs to exit when either x3 or x4 runs out, so clamp x4 to be
+ * no greater than x3 and use x4 for the loop.
+ * However, if x4 comes out of the loop with less than 16 bytes
+ * left, a partial read would be necessary to avoid reading beyond
+ * the end of the image. To avoid this, clamp x4 to the next
+ * multiple of 16, which is still sufficient to force it out of the
+ * loop but doesn't imply a rewind.
+ */
+ add x12, x3, #15
+ bic x12, x12, #15
+ cmp x4, x12
+ csel x4, x12, x4, hi
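+ /* In C terms, the clamp above is: x4 = min(x4, (x3 + 15) & ~15). */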
+
+ /* First calculate the entry-point into the internal fetch logic.
+ * This is done so the same function can service several kernel
+ * sizes.
+ */
+ adrp x8, \labelnc
+ add x8, x8, #:lo12:\labelnc
+ sub x8, x8, x5, LSL #5
+ sub x8, x8, x5, LSL #3
+ cmp x5, x6
+ ccmp x5, x7, #0, eq
+ beq 5f
+
+ /* if (r != rup || r != rdn) then the address-clamping table should
+ * be used rather than the short-cut version.
+ */
+ adrp x8, \labelc
+ add x8, x8, #:lo12:\labelc
+ sub x8, x8, x5, LSL #6
+ add x8, x8, x5, LSL #3
+ b 5f
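+
+ /* The two computed entry points above work out to (labelnc - 40*r) and
+ * (labelc - 56*r); presumably each unrolled tap occupies 40 bytes in the
+ * non-clamped variant and 56 bytes in the clamped one.
+ */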
+
+ /* Main loop: ... */
+ .align 4
+3: /* first perform a vertical convolution from memory to get the next
+ * 16 taps of the horizontal window into the register file...
+ */
+ fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8
+
+ /* ...then perform a horizontal convolution on that window to
+ * produce eight output bytes, and slide the window along.
+ * This has to be done twice to match the 16-way vertical pass.
+ * It would be preferable to have twice the work done in \core, but
+ * that would demand yet another variant on those macros and would
+ * perturb the register allocation severely.
+ */
+ \core
+ st1 {v15.8b}, [x0], #8
+ \core
+ st1 {v15.8b}, [x0], #8
+
+ sub x3, x3, #16
+5: subs x4, x4, #16
+ bhi 3b
+ /* Here there's 16 or fewer bytes available before the edge of the
+ * source image. x4 holds that count minus 16 (because it was
+ * decremented before the first iteration ran). The last read may
+ * not be a whole chunk, and beyond that a fill value must be used.
+ *
+ * Of course, none of that matters if there's no more output to
+ * produce...
+ */
+ cbz x3, 5f
+
+ /* Oh well. */
+ adds x4, x4, #16
+ bne 1f
+ .if \step==1
+ dup v10.8h, v9.h[7]
+ dup v11.8h, v9.h[7]
+ .else
+ dup v10.2d, v9.d[1]
+ dup v11.2d, v9.d[1]
+ .endif
+ b 3f
+
+ /* To avoid reading past end of input, rewind pointers by (16-x4)
+ * to ensure that they're exactly 16 bytes from the edge.
+ */
+1: mov x11, x4
+ bl fetch_clampright\step
+ /* Now to put this padding to use, perform any remaining
+ * iterations. This is done at half the rate of the main loop,
+ * because there's no longer pressure from a 16-lane window filler.
+ */
+3: \core
+ .if \step==1
+ dup v11.8h, v11.h[7]
+ .else
+ dup v11.2d, v11.d[1]
+ .endif
+ subs x3, x3, #8
+ blo 4f
+ st1 {v15.8b}, [x0], #8
+ bne 3b
+ b 5f
+
+ /* If the final iteration contained 0 < l < 8 values, then perform
+ * a piecewise store of the final vector.
+ */
+4: tbz x3, #2, 1f
+ st1 {v15.s}[0], [x0], #4
+ ext v15.8b, v15.8b, v15.8b, #4
+1: tbz x3, #1, 1f
+ st1 {v15.h}[0], [x0], #2
+ ext v15.8b, v15.8b, v15.8b, #2
+1: tbz x3, #0, 5f
+ st1 {v15.b}[0], [x0], #1
+ ext v15.8b, v15.8b, v15.8b, #1
+5: mov x0, #0
+.endm
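+
+/* A rough outline of the control flow in the conv_body macro above, in C-like
+ * pseudocode (illustrative only; helper names are hypothetical):
+ *
+ *   inlen = min(inlen, (count + 15) & ~15);       // the clamp at the top
+ *   while (inlen > 16) {                          // main loop
+ *       fetch16(); write8(); write8();            // 16 columns in, 16 bytes out
+ *       count -= 16; inlen -= 16;
+ *   }
+ *   if (count == 0) return;
+ *   fetch the final partial chunk (or pure right padding if inlen is 0);
+ *   while (count >= 8) { write8(); count -= 8; }  // half-rate tail loop
+ *   store the last (count & 7) bytes piecewise: 4, then 2, then 1.
+ */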
+
+
+.irp r, TUNED_LIST1, 25
+PRIVATE(convolve1_\r)
+ stp x29,x30, [sp, #-16]!
+
+ prefill step=1, max_r=\r, label=.Lcnv1_\r
+
+ conv_body core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
+
+ ldp x29,x30, [sp], #16
+ ret
+END(convolve1_\r)
+.endr
+
+.irp r, TUNED_LIST4, 25
+PRIVATE(convolve4_\r)
+ sub x9, sp, #0x40
+ stp x29,x30, [sp, #-(16 + 0x40 + 0x80)]!
+ bic x9, x9, #0x7f
+
+ /* x9 now points to a 0x40 byte buffer on the stack whose address
+ * has the low 7 bits clear. This allows easy address calculation
+ * in the wrap-around cases.
+ */
+
+ prefill step=4, max_r=\r, label=.Lcnv4_\r
+
+ conv_body core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
+
+ ldp x29,x30, [sp], #(16 + 0x40 + 0x80)
+ ret
+END(convolve4_\r)
+.endr
+
+/* void rsdIntrinsicBlurU1_K(
+ * void *out, // x0
+ * void *in, // x1
+ * size_t w, // x2
+ * size_t h, // x3
+ * size_t p, // x4
+ * size_t x, // x5
+ * size_t y, // x6
+ * size_t count, // x7
+ * size_t r, // [sp]
+ * uint16_t *tab); // [sp,#8]
+ */
+ENTRY(rsdIntrinsicBlurU1_K)
+ stp x19,x30, [sp, #-16]!
+ sub x8, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x8]
+ mov x8, x5 // x
+ ldr w5, [sp,#80] // r
+ sub x9, x2, x8 // w - x
+ sub x10, x3, x6 // h - y
+ mov x2, x4 // pitch
+ mov x3, x7 // count
+ sub x7, x10, #1 // h - y - 1
+ mov x4, x9 // inlen = (w - x)
+
+ ldr x12, [sp, #88] // tab
+
+ add x1, x1, x8 // src += x
+
+ cmp x6, x5
+ csel x6, x5, x6, hs // rup = min(r, y)
+ cmp x7, x5
+ csel x7, x5, x7, hs // rdn = min(r, h - y - 1)
+
+ sub x13, xzr, x2 // -pitch
+ msub x15, x2, x6, x1
+ madd x19, x2, x7, x1
+
+ ld1 {v0.8h,v1.8h}, [x12], #32
+ ld1 {v2.8h,v3.8h}, [x12], #32
+
+ adr x30, 1f
+ .irp r, TUNED_LIST1
+ cmp x5, #\r
+ bls convolve1_\r
+ .endr
+ b convolve1_25
+
+1: ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ldp x19,x30, [sp], #16
+ ret
+END(rsdIntrinsicBlurU1_K)
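+
+/* In C terms, the argument marshalling above is approximately (hypothetical
+ * names, for illustration only):
+ *
+ *   rup = min(r, y);              // rows available above the start row
+ *   rdn = min(r, h - y - 1);      // rows available below it
+ *   inlen = w - x;                // image data right of the start pixel
+ *   src = in + x;
+ *   top = src - rup * p;          // x15
+ *   bottom = src + rdn * p;       // x19
+ *
+ * followed by a tail-call to the convolve1_<n> variant with the smallest
+ * n >= r, falling back to convolve1_25.
+ */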
+
+/* void rsdIntrinsicBlurU4_K(
+ * void *out, // x0
+ * void *in, // x1
+ * size_t w, // x2
+ * size_t h, // x3
+ * size_t p, // x4
+ * size_t x, // x5
+ * size_t y, // x6
+ * size_t count, // x7
+ * size_t r, // [sp]
+ * uint16_t *tab); // [sp,#8]
+ */
+ENTRY(rsdIntrinsicBlurU4_K)
+ stp x19,x30, [sp, #-16]!
+ sub x8, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x8]
+ lsl x8, x5, #2 // x
+ lsl x2, x2, #2
+ ldr w5, [sp,#80] // r
+ sub x9, x2, x8 // w - x
+ sub x10, x3, x6 // h - y
+ mov x2, x4 // pitch
+ lsl x3, x7, #2 // count
+ sub x7, x10, #1 // h - y - 1
+ mov x4, x9 // inlen = (w - x)
+
+ ldr x12, [sp, #88]
+
+ add x1, x1, x8 // in += x
+
+ cmp x6, x5
+ csel x6, x5, x6, hs // rup = min(r, y)
+ cmp x7, x5
+ csel x7, x5, x7, hs // rdn = min(r, h - y - 1)
+
+
+ sub x13, xzr, x2
+ msub x15, x2, x6, x1
+ madd x19, x2, x7, x1
+
+ ld1 {v0.8h,v1.8h}, [x12], #32
+ ld1 {v2.8h,v3.8h}, [x12], #32
+
+ adr x30, 1f
+ .irp r, TUNED_LIST4
+ cmp x5, #\r
+ bls convolve4_\r
+ .endr
+ b convolve4_25
+
+1: ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ldp x19,x30, [sp], #16
+ ret
+END(rsdIntrinsicBlurU4_K)
diff --git a/toolkit/Blur_neon.S b/toolkit/Blur_neon.S
new file mode 100644
index 0000000..241af5f
--- /dev/null
+++ b/toolkit/Blur_neon.S
@@ -0,0 +1,1824 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+#define ARCH_ARM_USE_BLUR_PRELOAD
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+/* Number of fractional bits to preserve in intermediate results. The
+ * intermediate storage is 16-bit, and we started with 8 bit data (the integer
+ * part), so this should be between 0 and 8.
+ */
+.set FRACTION_BITS, 7
+
+.set MAX_R, 25
+
+
+/* A quick way of making a line of code conditional on some other condition.
+ * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
+ * `ifcc`:
+ */
+.macro ifcc zzz:vararg
+.if cc
+ \zzz
+.endif
+.endm
+
+/* It's not always clear that prefetching is beneficial and this needs further
+ * testing on different cores, so it's made switchable here.
+ */
+#if defined(ARCH_ARM_USE_BLUR_PRELOAD)
+#define VERTPLD(...) pld [__VA_ARGS__]
+#else
+#define VERTPLD(...) nop
+#endif
+
+/* Fetch 16 columns of bytes (regardless of image format), convolve these
+ * vertically, and leave them in the register file. If working near the top or
+ * bottom of an image then clamp the addressing while loading the data in.
+ *
+ * The convolution is fully unrolled for windows up to max_r, with the
+ * outermost edges calculated first. This way it's possible to branch directly
+ * into the relevant part of the code for an arbitrary convolution radius. Two
+ * variants of the loop are produced; one eliminates the clamping code for a
+ * slight speed advantage.
+ *
+ * Where the macro is called with reg=x, the specified register is taken to
+ * contain a pre-calculated pointer into one of the two loops.
+ *
+ * Input:
+ * r1 -- src
+ * r2 -- pitch
+ * r5 -- r
+ * r6 -- rup (r, unless clipped to top of source image)
+ * r7 -- rdn (r, unless clipped to bottom of source image)
+ * r12 -- switch index
+ * q0-q3 -- coefficient table
+ * Output:
+ * r1 += 16
+ * q10,q11 -- 16 convolved columns
+ * Modifies:
+ * r10 = upper row pointer
+ * r11 = lower row pointer
+ * q12-q15 = temporary sums
+ */
+.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=r12 /*{{{*/
+ .ifc \reg,r12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
+
+ vld1.8 {d30,d31}, [r1]
+ mls r10, r2, r6, r1
+
+ vmovl.u8 q14, d30
+ VERTPLD(r1, #32)
+ vmovl.u8 q15, d31
+ .if \max_r < 16 // approximate
+ ifcc adr \reg, 1f
+ .else
+ ifcc ldr \reg, 2f
+1: ifcc add \reg, \reg, pc
+ .endif
+
+ vmull.u16 q12, d28, d0[0]
+ ifcc sub \reg, r5, LSL #6
+ vmull.u16 q13, d29, d0[0]
+ mla r11, r2, r7, r1
+ vmull.u16 q14, d30, d0[0]
+ add r1, r1, #16
+ vmull.u16 q15, d31, d0[0]
+ bx \reg
+
+ ifcc .align 2
+ 2: ifcc .word 1f-1b-8
+
+ /* This version of the vertical fetch loop body is used away from the edges
+ * of the source image. The pointers start at the top and bottom source rows
+ * and work their way towards the centre on each iteration. This way the
+ * number of taps used can be controlled by jumping directly into the middle
+ * of the loop and running to completion.
+ * If the loop body changes size then the code which calculates the address of
+ * the initial iteration must be updated accordingly.
+ */
+ .macro vertfetch_noclamp i, dreg
+ .if 0 < \i && \i <= \max_r
+ vld1.8 {d20,d21}, [r10], r2
+ vld1.8 {d22,d23}, [r11]
+ sub r11, r11, r2
+ vswp d21, d22
+ VERTPLD(r10, #32)
+ vaddl.u8 q10, d20, d21
+ vaddl.u8 q11, d22, d23
+ vmlal.u16 q12, d20, \dreg
+ VERTPLD(r11, #32)
+ vmlal.u16 q13, d21, \dreg
+ vmlal.u16 q14, d22, \dreg
+ vmlal.u16 q15, d23, \dreg
+ .endif
+ .endm
+
+ /* This version of the vertical fetch loop body is used near the edges of the
+ * source image, where one or both of the accesses may start with a clamped
+ * value, and the row addresses only begin to change after some number of
+ * iterations before the end.
+ * If the loop body changes size then the code which calculates the address of
+ * the initial iteration must be updated accordingly.
+ */
+ .macro vertfetch_clamped i, dreg
+ .if 0 < \i && \i <= \max_r
+ vld1.8 {d20,d21}, [r10]
+ vld1.8 {d22,d23}, [r11]
+ cmp r6, #\i
+ vswp d21, d22
+ VERTPLD(r10, #32)
+ vaddl.u8 q10, d20, d21
+ addhs r10, r10, r2
+ vaddl.u8 q11, d22, d23
+ cmp r7, #\i
+ vmlal.u16 q12, d20, \dreg
+ VERTPLD(r11, #32)
+ vmlal.u16 q13, d21, \dreg
+ subhs r11, r11, r2
+ vmlal.u16 q14, d22, \dreg
+ nop
+ vmlal.u16 q15, d23, \dreg
+ .endif
+ .endm
+
+ /* Entry into this unrolled loop is computed as a negative index from
+ * \labelc at the end of the block.
+ */
+ .align 4
+ vertfetch_clamped 27, d6[3]
+ vertfetch_clamped 26, d6[2]
+ vertfetch_clamped 25, d6[1]
+ vertfetch_clamped 24, d6[0]
+ vertfetch_clamped 23, d5[3]
+ vertfetch_clamped 22, d5[2]
+ vertfetch_clamped 21, d5[1]
+ vertfetch_clamped 20, d5[0]
+ vertfetch_clamped 19, d4[3]
+ vertfetch_clamped 18, d4[2]
+ vertfetch_clamped 17, d4[1]
+ vertfetch_clamped 16, d4[0]
+ vertfetch_clamped 15, d3[3]
+ vertfetch_clamped 14, d3[2]
+ vertfetch_clamped 13, d3[1]
+ vertfetch_clamped 12, d3[0]
+ vertfetch_clamped 11, d2[3]
+ vertfetch_clamped 10, d2[2]
+ vertfetch_clamped 9, d2[1]
+ vertfetch_clamped 8, d2[0]
+ vertfetch_clamped 7, d1[3]
+ vertfetch_clamped 6, d1[2]
+ vertfetch_clamped 5, d1[1]
+ vertfetch_clamped 4, d1[0]
+ vertfetch_clamped 3, d0[3]
+ vertfetch_clamped 2, d0[2]
+ vertfetch_clamped 1, d0[1]
+ vertfetch_clamped 0, d0[0]
+ 1:
+ \labelc : b 2f /* done with clamped loop, skip over non-clamped loop */
+
+ /* Entry into this unrolled loop is computed as a negative index from
+ * \labelnc at the end of the block.
+ */
+ .align 4
+ vertfetch_noclamp 27, d6[3]
+ vertfetch_noclamp 26, d6[2]
+ vertfetch_noclamp 25, d6[1]
+ vertfetch_noclamp 24, d6[0]
+ vertfetch_noclamp 23, d5[3]
+ vertfetch_noclamp 22, d5[2]
+ vertfetch_noclamp 21, d5[1]
+ vertfetch_noclamp 20, d5[0]
+ vertfetch_noclamp 19, d4[3]
+ vertfetch_noclamp 18, d4[2]
+ vertfetch_noclamp 17, d4[1]
+ vertfetch_noclamp 16, d4[0]
+ vertfetch_noclamp 15, d3[3]
+ vertfetch_noclamp 14, d3[2]
+ vertfetch_noclamp 13, d3[1]
+ vertfetch_noclamp 12, d3[0]
+ vertfetch_noclamp 11, d2[3]
+ vertfetch_noclamp 10, d2[2]
+ vertfetch_noclamp 9, d2[1]
+ vertfetch_noclamp 8, d2[0]
+ vertfetch_noclamp 7, d1[3]
+ vertfetch_noclamp 6, d1[2]
+ vertfetch_noclamp 5, d1[1]
+ vertfetch_noclamp 4, d1[0]
+ vertfetch_noclamp 3, d0[3]
+ vertfetch_noclamp 2, d0[2]
+ vertfetch_noclamp 1, d0[1]
+ vertfetch_noclamp 0, d0[0]
+ \labelnc :
+
+ .purgem vertfetch_clamped
+ .purgem vertfetch_noclamp
+
+ 2: vqrshrn.u32 d20, q12, #16 - FRACTION_BITS
+ vqrshrn.u32 d21, q13, #16 - FRACTION_BITS
+ vqrshrn.u32 d22, q14, #16 - FRACTION_BITS
+ vqrshrn.u32 d23, q15, #16 - FRACTION_BITS
+.endm /*}}}*/
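+
+/* What one invocation of the fetch macro computes, as a rough C sketch
+ * (illustrative only; this assumes the coefficient table holds 0.16
+ * fixed-point weights, and round_shift() stands for vqrshrn's rounding,
+ * saturating narrow):
+ *
+ *   for (int j = 0; j < 16; j++) {               // 16 columns
+ *       uint32_t acc = coeff[0] * centre_row[j];
+ *       for (int i = 1; i <= r; i++)
+ *           acc += coeff[i] * (uint32_t)(row_above_i[j] + row_below_i[j]);
+ *       column[j] = round_shift(acc, 16 - FRACTION_BITS);  // u16 result
+ *   }
+ *
+ * with the clamped variant holding the row pointers at the image edges rather
+ * than stepping them past the top or bottom row.
+ */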
+
+/* Some portion of the convolution window (as much as will fit, and all of it
+ * for the uchar1 cases) is kept in the register file to avoid unnecessary
+ * memory accesses. This forces the horizontal loops to be unrolled because
+ * there's no indexed addressing into the register file.
+ *
+ * As in the fetch macro, the operations are ordered from outside to inside, so
+ * that jumping into the middle of the block bypasses the unwanted window taps.
+ *
+ * There are several variants of the macro because of the fixed offsets of the
+ * taps -- the wider the maximum radius the further the centre tap is from the
+ * most recently fetched data. This means that pre-filling the window requires
+ * more data that won't be used and it means that rotating the window involves
+ * more mov operations.
+ *
+ * When the buffer gets too big the buffer at [r9] is used.
+ *
+ * Input:
+ * q4-q11 -- convolution window
+ * r9 -- pointer to additional convolution window data
+ * Output:
+ * r9 -- updated buffer pointer (if used)
+ * d31 -- result to be stored
+ * Modifies:
+ * r12 -- temp buffer pointer
+ * q12-q13 -- temporaries for load and vext operations.
+ * q14-q15 -- intermediate sums
+ */
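+
+/* A rough C equivalent of one horizontal-pass iteration (illustrative only;
+ * `win` holds the vertically convolved u16 columns and `c` is the centre tap
+ * position, which differs between the variants below):
+ *
+ *   for (int j = 0; j < 8; j++) {
+ *       uint32_t acc = coeff[0] * win[c + j];
+ *       for (int i = 1; i <= r; i++)
+ *           acc += coeff[i] * (uint32_t)(win[c + j - i] + win[c + j + i]);
+ *       out[j] = round_shift(round_shift(acc, 16), FRACTION_BITS);  // back to u8
+ *   }
+ *
+ * after which the window slides along by one chunk (the vmov block at the end
+ * of each macro).
+ */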
+#define TUNED_LIST1 8, 16
+.macro hconv1_8/*{{{*/
+ vmull.u16 q14, d18, d0[0]
+ vmull.u16 q15, d19, d0[0]
+
+ ldr r12, [pc, r5, LSL #2]
+ add pc, pc, r12
+ bkpt
+ 100: .word 101f-100b
+ .word 102f-100b
+ .word 103f-100b
+ .word 104f-100b
+ .word 105f-100b
+ .word 106f-100b
+ .word 107f-100b
+ .word 108f-100b
+ 108: vmlal.u16 q14, d16, d2[0]
+ vmlal.u16 q15, d17, d2[0]
+ vmlal.u16 q14, d20, d2[0]
+ vmlal.u16 q15, d21, d2[0]
+ 107: vext.u16 q12, q8, q9, #1
+ vext.u16 q13, q9, q10, #7
+ vmlal.u16 q14, d24, d1[3]
+ vmlal.u16 q15, d25, d1[3]
+ vmlal.u16 q14, d26, d1[3]
+ vmlal.u16 q15, d27, d1[3]
+ 106: vext.u16 q12, q8, q9, #2
+ vext.u16 q13, q9, q10, #6
+ vmlal.u16 q14, d24, d1[2]
+ vmlal.u16 q15, d25, d1[2]
+ vmlal.u16 q14, d26, d1[2]
+ vmlal.u16 q15, d27, d1[2]
+ 105: vext.u16 q12, q8, q9, #3
+ vext.u16 q13, q9, q10, #5
+ vmlal.u16 q14, d24, d1[1]
+ vmlal.u16 q15, d25, d1[1]
+ vmlal.u16 q14, d26, d1[1]
+ vmlal.u16 q15, d27, d1[1]
+ 104: //vext.u16 q12, q8, q9, #4
+ //vext.u16 q13, q9, q10, #4
+ vmlal.u16 q14, d17, d1[0]
+ vmlal.u16 q15, d18, d1[0]
+ vmlal.u16 q14, d19, d1[0]
+ vmlal.u16 q15, d20, d1[0]
+ 103: vext.u16 q12, q8, q9, #5
+ vext.u16 q13, q9, q10, #3
+ vmlal.u16 q14, d24, d0[3]
+ vmlal.u16 q15, d25, d0[3]
+ vmlal.u16 q14, d26, d0[3]
+ vmlal.u16 q15, d27, d0[3]
+ 102: vext.u16 q12, q8, q9, #6
+ vext.u16 q13, q9, q10, #2
+ vmlal.u16 q14, d24, d0[2]
+ vmlal.u16 q15, d25, d0[2]
+ vmlal.u16 q14, d26, d0[2]
+ vmlal.u16 q15, d27, d0[2]
+ 101: vext.u16 q12, q8, q9, #7
+ vext.u16 q13, q9, q10, #1
+ vmlal.u16 q14, d24, d0[1]
+ vmlal.u16 q15, d25, d0[1]
+ vmlal.u16 q14, d26, d0[1]
+ vmlal.u16 q15, d27, d0[1]
+
+ vqrshrn.u32 d28, q14, #16
+ vqrshrn.u32 d29, q15, #16
+ vqrshrn.u16 d31, q14, #FRACTION_BITS
+
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+.endm/*}}}*/
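+
+/* A note on the dispatch used by these macros: `ldr r12, [pc, r5, LSL #2]`
+ * followed by `add pc, pc, r12` is a computed goto (in ARM state the pc reads
+ * as the current instruction plus 8, which is what makes the offsets in the
+ * table at 100: line up). The radius in r5 selects a label 1xx, and the taps
+ * then fall through from the outermost down to tap 1, as in a C switch with
+ * no break statements:
+ *
+ *   switch (r) {
+ *   case 8: acc += coeff[8] * (win[c - 8] + win[c + 8]);   // falls through
+ *   case 7: acc += coeff[7] * (win[c - 7] + win[c + 7]);   // falls through
+ *   ...
+ *   case 1: acc += coeff[1] * (win[c - 1] + win[c + 1]);
+ *   }
+ */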
+
+.macro hconv1_16/*{{{*/
+ vmull.u16 q14, d16, d0[0]
+ vmull.u16 q15, d17, d0[0]
+
+ ldr r12, [pc, r5, LSL #2]
+ add pc, pc, r12
+ bkpt
+ 100: .word 101f-100b
+ .word 102f-100b
+ .word 103f-100b
+ .word 104f-100b
+ .word 105f-100b
+ .word 106f-100b
+ .word 107f-100b
+ .word 108f-100b
+ .word 109f-100b
+ .word 110f-100b
+ .word 111f-100b
+ .word 112f-100b
+ .word 113f-100b
+ .word 114f-100b
+ .word 115f-100b
+ .word 116f-100b
+ 116: //vext.u16 q12, q6, q7, #0
+ //vext.u16 q13, q10, q11, #0
+ vmlal.u16 q14, d12, d4[0]
+ vmlal.u16 q15, d13, d4[0]
+ vmlal.u16 q14, d20, d4[0]
+ vmlal.u16 q15, d21, d4[0]
+ 115: vext.u16 q12, q6, q7, #1
+ vext.u16 q13, q9, q10, #7
+ vmlal.u16 q14, d24, d3[3]
+ vmlal.u16 q15, d25, d3[3]
+ vmlal.u16 q14, d26, d3[3]
+ vmlal.u16 q15, d27, d3[3]
+ 114: vext.u16 q12, q6, q7, #2
+ vext.u16 q13, q9, q10, #6
+ vmlal.u16 q14, d24, d3[2]
+ vmlal.u16 q15, d25, d3[2]
+ vmlal.u16 q14, d26, d3[2]
+ vmlal.u16 q15, d27, d3[2]
+ 113: vext.u16 q12, q6, q7, #3
+ vext.u16 q13, q9, q10, #5
+ vmlal.u16 q14, d24, d3[1]
+ vmlal.u16 q15, d25, d3[1]
+ vmlal.u16 q14, d26, d3[1]
+ vmlal.u16 q15, d27, d3[1]
+ 112: //vext.u16 q12, q6, q7, #4
+ //vext.u16 q13, q9, q10, #4
+ vmlal.u16 q14, d13, d3[0]
+ vmlal.u16 q15, d14, d3[0]
+ vmlal.u16 q14, d19, d3[0]
+ vmlal.u16 q15, d20, d3[0]
+ 111: vext.u16 q12, q6, q7, #5
+ vext.u16 q13, q9, q10, #3
+ vmlal.u16 q14, d24, d2[3]
+ vmlal.u16 q15, d25, d2[3]
+ vmlal.u16 q14, d26, d2[3]
+ vmlal.u16 q15, d27, d2[3]
+ 110: vext.u16 q12, q6, q7, #6
+ vext.u16 q13, q9, q10, #2
+ vmlal.u16 q14, d24, d2[2]
+ vmlal.u16 q15, d25, d2[2]
+ vmlal.u16 q14, d26, d2[2]
+ vmlal.u16 q15, d27, d2[2]
+ 109: vext.u16 q12, q6, q7, #7
+ vext.u16 q13, q9, q10, #1
+ vmlal.u16 q14, d24, d2[1]
+ vmlal.u16 q15, d25, d2[1]
+ vmlal.u16 q14, d26, d2[1]
+ vmlal.u16 q15, d27, d2[1]
+ 108: //vext.u16 q12, q7, q8, #0
+ //vext.u16 q13, q9, q10, #0
+ vmlal.u16 q14, d14, d2[0]
+ vmlal.u16 q15, d15, d2[0]
+ vmlal.u16 q14, d18, d2[0]
+ vmlal.u16 q15, d19, d2[0]
+ 107: vext.u16 q12, q7, q8, #1
+ vext.u16 q13, q8, q9, #7
+ vmlal.u16 q14, d24, d1[3]
+ vmlal.u16 q15, d25, d1[3]
+ vmlal.u16 q14, d26, d1[3]
+ vmlal.u16 q15, d27, d1[3]
+ 106: vext.u16 q12, q7, q8, #2
+ vext.u16 q13, q8, q9, #6
+ vmlal.u16 q14, d24, d1[2]
+ vmlal.u16 q15, d25, d1[2]
+ vmlal.u16 q14, d26, d1[2]
+ vmlal.u16 q15, d27, d1[2]
+ 105: vext.u16 q12, q7, q8, #3
+ vext.u16 q13, q8, q9, #5
+ vmlal.u16 q14, d24, d1[1]
+ vmlal.u16 q15, d25, d1[1]
+ vmlal.u16 q14, d26, d1[1]
+ vmlal.u16 q15, d27, d1[1]
+ 104: //vext.u16 q12, q7, q8, #4
+ //vext.u16 q13, q8, q9, #4
+ vmlal.u16 q14, d15, d1[0]
+ vmlal.u16 q15, d16, d1[0]
+ vmlal.u16 q14, d17, d1[0]
+ vmlal.u16 q15, d18, d1[0]
+ 103: vext.u16 q12, q7, q8, #5
+ vext.u16 q13, q8, q9, #3
+ vmlal.u16 q14, d24, d0[3]
+ vmlal.u16 q15, d25, d0[3]
+ vmlal.u16 q14, d26, d0[3]
+ vmlal.u16 q15, d27, d0[3]
+ 102: vext.u16 q12, q7, q8, #6
+ vext.u16 q13, q8, q9, #2
+ vmlal.u16 q14, d24, d0[2]
+ vmlal.u16 q15, d25, d0[2]
+ vmlal.u16 q14, d26, d0[2]
+ vmlal.u16 q15, d27, d0[2]
+ 101: vext.u16 q12, q7, q8, #7
+ vext.u16 q13, q8, q9, #1
+ vmlal.u16 q14, d24, d0[1]
+ vmlal.u16 q15, d25, d0[1]
+ vmlal.u16 q14, d26, d0[1]
+ vmlal.u16 q15, d27, d0[1]
+
+ vqrshrn.u32 d28, q14, #16
+ vqrshrn.u32 d29, q15, #16
+ vqrshrn.u16 d31, q14, #FRACTION_BITS
+
+ vmov q6, q7
+ vmov q7, q8
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+.endm/*}}}*/
+
+.macro hconv1_25/*{{{*/
+ vext.u16 q12, q6, q7, #7
+ vmull.u16 q14, d24, d0[0]
+ vmull.u16 q15, d25, d0[0]
+
+ ldr r12, [pc, r5, LSL #2]
+ add pc, pc, r12
+ bkpt
+ 100: .word 101f-100b
+ .word 102f-100b
+ .word 103f-100b
+ .word 104f-100b
+ .word 105f-100b
+ .word 106f-100b
+ .word 107f-100b
+ .word 108f-100b
+ .word 109f-100b
+ .word 110f-100b
+ .word 111f-100b
+ .word 112f-100b
+ .word 113f-100b
+ .word 114f-100b
+ .word 115f-100b
+ .word 116f-100b
+ .word 117f-100b
+ .word 118f-100b
+ .word 119f-100b
+ .word 120f-100b
+ .word 121f-100b
+ .word 122f-100b
+ .word 123f-100b
+ .word 124f-100b
+ .word 125f-100b
+ 125: vext.u16 q12, q3, q4, #6
+ vext.u16 q13, q10, q11, #0
+ vmlal.u16 q14, d24, d6[1]
+ vmlal.u16 q15, d25, d6[1]
+ vmlal.u16 q14, d26, d6[1]
+ vmlal.u16 q15, d27, d6[1]
+ 124: vext.u16 q12, q3, q4, #7
+ vext.u16 q13, q9, q10, #7
+ vmlal.u16 q14, d24, d6[0]
+ vmlal.u16 q15, d25, d6[0]
+ vmlal.u16 q14, d26, d6[0]
+ vmlal.u16 q15, d27, d6[0]
+ 123: vext.u16 q12, q4, q5, #0
+ vext.u16 q13, q9, q10, #6
+ vmlal.u16 q14, d24, d5[3]
+ vmlal.u16 q15, d25, d5[3]
+ vmlal.u16 q14, d26, d5[3]
+ vmlal.u16 q15, d27, d5[3]
+ 122: vext.u16 q12, q4, q5, #1
+ vext.u16 q13, q9, q10, #5
+ vmlal.u16 q14, d24, d5[2]
+ vmlal.u16 q15, d25, d5[2]
+ vmlal.u16 q14, d26, d5[2]
+ vmlal.u16 q15, d27, d5[2]
+ 121: vext.u16 q12, q4, q5, #2
+ vext.u16 q13, q9, q10, #4
+ vmlal.u16 q14, d24, d5[1]
+ vmlal.u16 q15, d25, d5[1]
+ vmlal.u16 q14, d26, d5[1]
+ vmlal.u16 q15, d27, d5[1]
+ 120: vext.u16 q12, q4, q5, #3
+ vext.u16 q13, q9, q10, #3
+ vmlal.u16 q14, d24, d5[0]
+ vmlal.u16 q15, d25, d5[0]
+ vmlal.u16 q14, d26, d5[0]
+ vmlal.u16 q15, d27, d5[0]
+ 119: vext.u16 q12, q4, q5, #4
+ vext.u16 q13, q9, q10, #2
+ vmlal.u16 q14, d24, d4[3]
+ vmlal.u16 q15, d25, d4[3]
+ vmlal.u16 q14, d26, d4[3]
+ vmlal.u16 q15, d27, d4[3]
+ 118: vext.u16 q12, q4, q5, #5
+ vext.u16 q13, q9, q10, #1
+ vmlal.u16 q14, d24, d4[2]
+ vmlal.u16 q15, d25, d4[2]
+ vmlal.u16 q14, d26, d4[2]
+ vmlal.u16 q15, d27, d4[2]
+ 117: vext.u16 q12, q4, q5, #6
+ vext.u16 q13, q9, q10, #0
+ vmlal.u16 q14, d24, d4[1]
+ vmlal.u16 q15, d25, d4[1]
+ vmlal.u16 q14, d26, d4[1]
+ vmlal.u16 q15, d27, d4[1]
+ 116: vext.u16 q12, q4, q5, #7
+ vext.u16 q13, q8, q9, #7
+ vmlal.u16 q14, d24, d4[0]
+ vmlal.u16 q15, d25, d4[0]
+ vmlal.u16 q14, d26, d4[0]
+ vmlal.u16 q15, d27, d4[0]
+ 115: vext.u16 q12, q5, q6, #0
+ vext.u16 q13, q8, q9, #6
+ vmlal.u16 q14, d24, d3[3]
+ vmlal.u16 q15, d25, d3[3]
+ vmlal.u16 q14, d26, d3[3]
+ vmlal.u16 q15, d27, d3[3]
+ 114: vext.u16 q12, q5, q6, #1
+ vext.u16 q13, q8, q9, #5
+ vmlal.u16 q14, d24, d3[2]
+ vmlal.u16 q15, d25, d3[2]
+ vmlal.u16 q14, d26, d3[2]
+ vmlal.u16 q15, d27, d3[2]
+ 113: vext.u16 q12, q5, q6, #2
+ vext.u16 q13, q8, q9, #4
+ vmlal.u16 q14, d24, d3[1]
+ vmlal.u16 q15, d25, d3[1]
+ vmlal.u16 q14, d26, d3[1]
+ vmlal.u16 q15, d27, d3[1]
+ 112: vext.u16 q12, q5, q6, #3
+ vext.u16 q13, q8, q9, #3
+ vmlal.u16 q14, d24, d3[0]
+ vmlal.u16 q15, d25, d3[0]
+ vmlal.u16 q14, d26, d3[0]
+ vmlal.u16 q15, d27, d3[0]
+ 111: vext.u16 q12, q5, q6, #4
+ vext.u16 q13, q8, q9, #2
+ vmlal.u16 q14, d24, d2[3]
+ vmlal.u16 q15, d25, d2[3]
+ vmlal.u16 q14, d26, d2[3]
+ vmlal.u16 q15, d27, d2[3]
+ 110: vext.u16 q12, q5, q6, #5
+ vext.u16 q13, q8, q9, #1
+ vmlal.u16 q14, d24, d2[2]
+ vmlal.u16 q15, d25, d2[2]
+ vmlal.u16 q14, d26, d2[2]
+ vmlal.u16 q15, d27, d2[2]
+ 109: vext.u16 q12, q5, q6, #6
+ vext.u16 q13, q8, q9, #0
+ vmlal.u16 q14, d24, d2[1]
+ vmlal.u16 q15, d25, d2[1]
+ vmlal.u16 q14, d26, d2[1]
+ vmlal.u16 q15, d27, d2[1]
+ 108: vext.u16 q12, q5, q6, #7
+ vext.u16 q13, q7, q8, #7
+ vmlal.u16 q14, d24, d2[0]
+ vmlal.u16 q15, d25, d2[0]
+ vmlal.u16 q14, d26, d2[0]
+ vmlal.u16 q15, d27, d2[0]
+ 107: vext.u16 q12, q6, q7, #0
+ vext.u16 q13, q7, q8, #6
+ vmlal.u16 q14, d24, d1[3]
+ vmlal.u16 q15, d25, d1[3]
+ vmlal.u16 q14, d26, d1[3]
+ vmlal.u16 q15, d27, d1[3]
+ 106: vext.u16 q12, q6, q7, #1
+ vext.u16 q13, q7, q8, #5
+ vmlal.u16 q14, d24, d1[2]
+ vmlal.u16 q15, d25, d1[2]
+ vmlal.u16 q14, d26, d1[2]
+ vmlal.u16 q15, d27, d1[2]
+ 105: vext.u16 q12, q6, q7, #2
+ vext.u16 q13, q7, q8, #4
+ vmlal.u16 q14, d24, d1[1]
+ vmlal.u16 q15, d25, d1[1]
+ vmlal.u16 q14, d26, d1[1]
+ vmlal.u16 q15, d27, d1[1]
+ 104: vext.u16 q12, q6, q7, #3
+ vext.u16 q13, q7, q8, #3
+ vmlal.u16 q14, d24, d1[0]
+ vmlal.u16 q15, d25, d1[0]
+ vmlal.u16 q14, d26, d1[0]
+ vmlal.u16 q15, d27, d1[0]
+ 103: vext.u16 q12, q6, q7, #4
+ vext.u16 q13, q7, q8, #2
+ vmlal.u16 q14, d24, d0[3]
+ vmlal.u16 q15, d25, d0[3]
+ vmlal.u16 q14, d26, d0[3]
+ vmlal.u16 q15, d27, d0[3]
+ 102: vext.u16 q12, q6, q7, #5
+ vext.u16 q13, q7, q8, #1
+ vmlal.u16 q14, d24, d0[2]
+ vmlal.u16 q15, d25, d0[2]
+ vmlal.u16 q14, d26, d0[2]
+ vmlal.u16 q15, d27, d0[2]
+ 101: vext.u16 q12, q6, q7, #6
+ vext.u16 q13, q7, q8, #0
+ vmlal.u16 q14, d24, d0[1]
+ vmlal.u16 q15, d25, d0[1]
+ vmlal.u16 q14, d26, d0[1]
+ vmlal.u16 q15, d27, d0[1]
+
+ vqrshrn.u32 d28, q14, #16
+ vqrshrn.u32 d29, q15, #16
+ vqrshrn.u16 d31, q14, #FRACTION_BITS
+
+ vmov d7, d9
+ vmov q4, q5
+ vmov q5, q6
+ vmov q6, q7
+ vmov q7, q8
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+.endm/*}}}*/
+
+#define TUNED_LIST4 6, 12
+.macro hconv4_6/*{{{*/
+ vmull.u16 q14, d14, d0[0]
+ vmull.u16 q15, d15, d0[0]
+
+ ldr r12, [pc, r5, LSL #2]
+ add pc, pc, r12
+ bkpt
+ 100: .word 101f-100b
+ .word 102f-100b
+ .word 103f-100b
+ .word 104f-100b
+ .word 105f-100b
+ .word 106f-100b
+ 106: vmlal.u16 q14, d8, d1[2]
+ vmlal.u16 q15, d9, d1[2]
+ vmlal.u16 q14, d20, d1[2]
+ vmlal.u16 q15, d21, d1[2]
+ 105: vmlal.u16 q14, d9, d1[1]
+ vmlal.u16 q15, d10, d1[1]
+ vmlal.u16 q14, d19, d1[1]
+ vmlal.u16 q15, d20, d1[1]
+ 104: vmlal.u16 q14, d10, d1[0]
+ vmlal.u16 q15, d11, d1[0]
+ vmlal.u16 q14, d18, d1[0]
+ vmlal.u16 q15, d19, d1[0]
+ 103: vmlal.u16 q14, d11, d0[3]
+ vmlal.u16 q15, d12, d0[3]
+ vmlal.u16 q14, d17, d0[3]
+ vmlal.u16 q15, d18, d0[3]
+ 102: vmlal.u16 q14, d12, d0[2]
+ vmlal.u16 q15, d13, d0[2]
+ vmlal.u16 q14, d16, d0[2]
+ vmlal.u16 q15, d17, d0[2]
+ 101: vmlal.u16 q14, d13, d0[1]
+ vmlal.u16 q15, d14, d0[1]
+ vmlal.u16 q14, d15, d0[1]
+ vmlal.u16 q15, d16, d0[1]
+
+ vqrshrn.u32 d28, q14, #16
+ vqrshrn.u32 d29, q15, #16
+ vqrshrn.u16 d31, q14, #FRACTION_BITS
+
+ vmov q4, q5
+ vmov q5, q6
+ vmov q6, q7
+ vmov q7, q8
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+.endm/*}}}*/
+
+.macro hconv4_12/*{{{*/
+ vmull.u16 q14, d8, d0[0]
+ vmull.u16 q15, d9, d0[0]
+
+ ldr r12, [pc, r5, LSL #2]
+ add pc, pc, r12
+ bkpt
+ 100: .word 101f-100b
+ .word 102f-100b
+ .word 103f-100b
+ .word 104f-100b
+ .word 105f-100b
+ .word 106f-100b
+ .word 107f-100b
+ .word 108f-100b
+ .word 109f-100b
+ .word 110f-100b
+ .word 111f-100b
+ .word 112f-100b
+ 112: add r12, r9, #0x1a0
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d3[0]
+ vmlal.u16 q15, d25, d3[0]
+ vmlal.u16 q14, d20, d3[0]
+ vmlal.u16 q15, d21, d3[0]
+ 111: add r12, r9, #0x1a8
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12:64]
+ vmlal.u16 q14, d24, d2[3]
+ vmlal.u16 q15, d25, d2[3]
+ vmlal.u16 q14, d19, d2[3]
+ vmlal.u16 q15, d20, d2[3]
+ 110: add r12, r9, #0x1b0
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d2[2]
+ vmlal.u16 q15, d25, d2[2]
+ vmlal.u16 q14, d18, d2[2]
+ vmlal.u16 q15, d19, d2[2]
+ 109: add r12, r9, #0x1b8
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12:64]
+ vmlal.u16 q14, d24, d2[1]
+ vmlal.u16 q15, d25, d2[1]
+ vmlal.u16 q14, d17, d2[1]
+ vmlal.u16 q15, d18, d2[1]
+ 108: add r12, r9, #0x1c0
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d2[0]
+ vmlal.u16 q15, d25, d2[0]
+ vmlal.u16 q14, d16, d2[0]
+ vmlal.u16 q15, d17, d2[0]
+ 107: add r12, r9, #0x1c8
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12:64]
+ vmlal.u16 q14, d24, d1[3]
+ vmlal.u16 q15, d25, d1[3]
+ vmlal.u16 q14, d15, d1[3]
+ vmlal.u16 q15, d16, d1[3]
+ 106: add r12, r9, #0x1d0
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d1[2]
+ vmlal.u16 q15, d25, d1[2]
+ vmlal.u16 q14, d14, d1[2]
+ vmlal.u16 q15, d15, d1[2]
+ 105: add r12, r9, #0x1d8
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12:64]
+ vmlal.u16 q14, d24, d1[1]
+ vmlal.u16 q15, d25, d1[1]
+ vmlal.u16 q14, d13, d1[1]
+ vmlal.u16 q15, d14, d1[1]
+ 104: add r12, r9, #0x1e0
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d1[0]
+ vmlal.u16 q15, d25, d1[0]
+ vmlal.u16 q14, d12, d1[0]
+ vmlal.u16 q15, d13, d1[0]
+ 103: add r12, r9, #0x1e8
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12:64]
+ vmlal.u16 q14, d24, d0[3]
+ vmlal.u16 q15, d25, d0[3]
+ vmlal.u16 q14, d11, d0[3]
+ vmlal.u16 q15, d12, d0[3]
+ 102: add r12, r9, #0x1f0
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d0[2]
+ vmlal.u16 q15, d25, d0[2]
+ vmlal.u16 q14, d10, d0[2]
+ vmlal.u16 q15, d11, d0[2]
+ 101: add r12, r9, #0x1f8
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]
+ vmlal.u16 q14, d24, d0[1]
+ vmlal.u16 q15, d8, d0[1]
+ vmlal.u16 q14, d9, d0[1]
+ vmlal.u16 q15, d10, d0[1]
+
+ vqrshrn.u32 d28, q14, #16
+ vqrshrn.u32 d29, q15, #16
+ vqrshrn.u16 d31, q14, #FRACTION_BITS
+
+ vst1.u8 {q4}, [r9:128]!
+ bic r9, r9, #0x200
+ vmov q4, q5
+ vmov q5, q6
+ vmov q6, q7
+ vmov q7, q8
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+.endm/*}}}*/
+
+.macro hconv4_25/*{{{*/
+ add r12, r9, #0x198
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12:64]
+ vmull.u16 q14, d24, d0[0]
+ vmull.u16 q15, d25, d0[0]
+
+ ldr r12, [pc, r5, LSL #2]
+ add pc, pc, r12
+ bkpt
+ 100: .word 101f-100b
+ .word 102f-100b
+ .word 103f-100b
+ .word 104f-100b
+ .word 105f-100b
+ .word 106f-100b
+ .word 107f-100b
+ .word 108f-100b
+ .word 109f-100b
+ .word 110f-100b
+ .word 111f-100b
+ .word 112f-100b
+ .word 113f-100b
+ .word 114f-100b
+ .word 115f-100b
+ .word 116f-100b
+ .word 117f-100b
+ .word 118f-100b
+ .word 119f-100b
+ .word 120f-100b
+ .word 121f-100b
+ .word 122f-100b
+ .word 123f-100b
+ .word 124f-100b
+ .word 125f-100b
+ 125: add r12, r9, #0x0d0
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d6[1]
+ vmlal.u16 q15, d25, d6[1]
+ vmlal.u16 q14, d20, d6[1]
+ vmlal.u16 q15, d21, d6[1]
+ 124: add r12, r9, #0x0d8
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ vmlal.u16 q14, d24, d6[0]
+ vmlal.u16 q15, d25, d6[0]
+ vmlal.u16 q14, d19, d6[0]
+ vmlal.u16 q15, d20, d6[0]
+ 123: add r12, r9, #0x0e0
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d5[3]
+ vmlal.u16 q15, d25, d5[3]
+ vmlal.u16 q14, d18, d5[3]
+ vmlal.u16 q15, d19, d5[3]
+ 122: add r12, r9, #0x0e8
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ vmlal.u16 q14, d24, d5[2]
+ vmlal.u16 q15, d25, d5[2]
+ vmlal.u16 q14, d17, d5[2]
+ vmlal.u16 q15, d18, d5[2]
+ 121: add r12, r9, #0x0f0
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d5[1]
+ vmlal.u16 q15, d25, d5[1]
+ vmlal.u16 q14, d16, d5[1]
+ vmlal.u16 q15, d17, d5[1]
+ 120: add r12, r9, #0x0f8
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ vmlal.u16 q14, d24, d5[0]
+ vmlal.u16 q15, d25, d5[0]
+ vmlal.u16 q14, d15, d5[0]
+ vmlal.u16 q15, d16, d5[0]
+ 119: add r12, r9, #0x100
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d4[3]
+ vmlal.u16 q15, d25, d4[3]
+ vmlal.u16 q14, d14, d4[3]
+ vmlal.u16 q15, d15, d4[3]
+ 118: add r12, r9, #0x108
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ vmlal.u16 q14, d24, d4[2]
+ vmlal.u16 q15, d25, d4[2]
+ vmlal.u16 q14, d13, d4[2]
+ vmlal.u16 q15, d14, d4[2]
+ 117: add r12, r9, #0x110
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d4[1]
+ vmlal.u16 q15, d25, d4[1]
+ vmlal.u16 q14, d12, d4[1]
+ vmlal.u16 q15, d13, d4[1]
+ 116: add r12, r9, #0x118
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ vmlal.u16 q14, d24, d4[0]
+ vmlal.u16 q15, d25, d4[0]
+ vmlal.u16 q14, d11, d4[0]
+ vmlal.u16 q15, d12, d4[0]
+ 115: add r12, r9, #0x120
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d3[3]
+ vmlal.u16 q15, d25, d3[3]
+ vmlal.u16 q14, d10, d3[3]
+ vmlal.u16 q15, d11, d3[3]
+ 114: add r12, r9, #0x128
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ vmlal.u16 q14, d24, d3[2]
+ vmlal.u16 q15, d25, d3[2]
+ vmlal.u16 q14, d9, d3[2]
+ vmlal.u16 q15, d10, d3[2]
+ 113: add r12, r9, #0x130
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ vmlal.u16 q14, d24, d3[1]
+ vmlal.u16 q15, d25, d3[1]
+ vmlal.u16 q14, d8, d3[1]
+ vmlal.u16 q15, d9, d3[1]
+ 112: add r12, r9, #0x138
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ add r12, r9, #0x1f8
+ bic r12, r12, #0x200
+ vld1.u16 {d26}, [r12:64]
+ vmlal.u16 q14, d24, d3[0]
+ vmlal.u16 q15, d25, d3[0]
+ vmlal.u16 q14, d26, d3[0] @ Could be d7, without the load, right?
+ vmlal.u16 q15, d8, d3[0]
+ 111: add r12, r9, #0x140
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ add r12, r9, #0x1f0
+ bic r12, r12, #0x200
+ vld1.u16 {d26,d27}, [r12:128]
+ vmlal.u16 q14, d24, d2[3]
+ vmlal.u16 q15, d25, d2[3]
+ vmlal.u16 q14, d26, d2[3]
+ vmlal.u16 q15, d27, d2[3]
+ 110: add r12, r9, #0x148
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ add r12, r9, #0x1e8
+ bic r12, r12, #0x200
+ vld1.u16 {d26}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d27}, [r12:64]
+ vmlal.u16 q14, d24, d2[2]
+ vmlal.u16 q15, d25, d2[2]
+ vmlal.u16 q14, d26, d2[2]
+ vmlal.u16 q15, d27, d2[2]
+ 109: add r12, r9, #0x150
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ add r12, r9, #0x1e0
+ bic r12, r12, #0x200
+ vld1.u16 {d26,d27}, [r12:128]
+ vmlal.u16 q14, d24, d2[1]
+ vmlal.u16 q15, d25, d2[1]
+ vmlal.u16 q14, d26, d2[1]
+ vmlal.u16 q15, d27, d2[1]
+ 108: add r12, r9, #0x158
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ add r12, r9, #0x1d8
+ bic r12, r12, #0x200
+ vld1.u16 {d26}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d27}, [r12:64]
+ vmlal.u16 q14, d24, d2[0]
+ vmlal.u16 q15, d25, d2[0]
+ vmlal.u16 q14, d26, d2[0]
+ vmlal.u16 q15, d27, d2[0]
+ 107: add r12, r9, #0x160
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ add r12, r9, #0x1d0
+ bic r12, r12, #0x200
+ vld1.u16 {d26,d27}, [r12:128]
+ vmlal.u16 q14, d24, d1[3]
+ vmlal.u16 q15, d25, d1[3]
+ vmlal.u16 q14, d26, d1[3]
+ vmlal.u16 q15, d27, d1[3]
+ 106: add r12, r9, #0x168
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ add r12, r9, #0x1c8
+ bic r12, r12, #0x200
+ vld1.u16 {d26}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d27}, [r12:64]
+ vmlal.u16 q14, d24, d1[2]
+ vmlal.u16 q15, d25, d1[2]
+ vmlal.u16 q14, d26, d1[2]
+ vmlal.u16 q15, d27, d1[2]
+ 105: add r12, r9, #0x170
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ add r12, r9, #0x1c0
+ bic r12, r12, #0x200
+ vld1.u16 {d26,d27}, [r12:128]
+ vmlal.u16 q14, d24, d1[1]
+ vmlal.u16 q15, d25, d1[1]
+ vmlal.u16 q14, d26, d1[1]
+ vmlal.u16 q15, d27, d1[1]
+ 104: add r12, r9, #0x178
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ add r12, r9, #0x1b8
+ bic r12, r12, #0x200
+ vld1.u16 {d26}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d27}, [r12:64]
+ vmlal.u16 q14, d24, d1[0]
+ vmlal.u16 q15, d25, d1[0]
+ vmlal.u16 q14, d26, d1[0]
+ vmlal.u16 q15, d27, d1[0]
+ 103: add r12, r9, #0x180
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]
+ add r12, r9, #0x1b0
+ bic r12, r12, #0x200
+ vld1.u16 {d26,d27}, [r12:128]
+ vmlal.u16 q14, d24, d0[3]
+ vmlal.u16 q15, d25, d0[3]
+ vmlal.u16 q14, d26, d0[3]
+ vmlal.u16 q15, d27, d0[3]
+ 102: add r12, r9, #0x188
+ bic r12, r12, #0x200
+ vld1.u16 {d24}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d25}, [r12]
+ add r12, r9, #0x1a8
+ bic r12, r12, #0x200
+ vld1.u16 {d26}, [r12:64]!
+ bic r12, r12, #0x200
+ vld1.u16 {d27}, [r12:64]
+ vmlal.u16 q14, d24, d0[2]
+ vmlal.u16 q15, d25, d0[2]
+ vmlal.u16 q14, d26, d0[2]
+ vmlal.u16 q15, d27, d0[2]
+ 101: add r12, r9, #0x190
+ bic r12, r12, #0x200
+ vld1.u16 {d24,d25}, [r12:128]!
+ bic r12, r12, #0x200
+ vld1.u16 {d26,d27}, [r12:128]
+ vmlal.u16 q14, d24, d0[1]
+ vmlal.u16 q15, d25, d0[1]
+ vmlal.u16 q14, d26, d0[1]
+ vmlal.u16 q15, d27, d0[1]
+
+ vqrshrn.u32 d28, q14, #16
+ vqrshrn.u32 d29, q15, #16
+ vqrshrn.u16 d31, q14, #FRACTION_BITS
+
+ vst1.u8 {q4}, [r9:128]!
+ bic r9, r9, #0x200
+ vmov q4, q5
+ vmov q5, q6
+ vmov q6, q7
+ vmov q7, q8
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+.endm/*}}}*/
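+
+/* A note on the add/bic address pairs in the larger hconv4 variants above:
+ * the spill buffer set up by convolve4_* (below) is aligned so that its low
+ * ten bits are clear, which makes clearing bit 9 of (r9 + offset) equivalent
+ * to indexing a 0x200-byte ring:
+ *
+ *   uint16_t *slot(uint8_t *base, size_t r9_off, size_t offset) {
+ *       return (uint16_t *)(base + ((r9_off + offset) & 0x1ff));  // mod 0x200
+ *   }
+ *
+ * The `bic r9, r9, #0x200` after each store keeps r9 itself inside the ring.
+ */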
+
+/* Dedicated function wrapper for the fetch macro, for the cases where
+ * performance isn't that important, to keep code size down.
+ */
+PRIVATE(fetch_generic_asm)
+ push {r10,r11}
+ fetch
+ pop {r10,r11}
+ bx lr
+END(fetch_generic_asm)
+
+
+/* Fetch the next (16 - (r10 & 15)) columns of data, avoiding reading memory
+ * beyond that limit, and filling the rest of the vector with the last legal
+ * pixel.
+ * Result is in q10 and q11. q8 and q9 are filled with the first legal pixel.
+ * Note: This function can read beyond the right edge of input if the image is
+ * narrower than 16 bytes.
+ */
+PRIVATE(fetch_clampleft1)
+ push {r12,lr}
+ bl fetch_generic_asm
+ vdup.u16 q8, d20[0]
+ vdup.u16 q9, d20[0]
+ ands r12, r10, #15
+ beq 1f
+ sub r1, r1, r12
+ sub r10, r10, r12
+ sub sp, sp, #32
+ vst1.u16 {q10,q11}, [sp]
+ sub r12, sp, r12, LSL #1
+ sub sp, sp, #32
+ vst1.u16 {q8,q9}, [sp]
+ vld1.u16 {q10,q11}, [r12]
+ add sp, sp, #64
+1: pop {r12,pc}
+END(fetch_clampleft1)
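+
+/* Net effect of the stack shuffle above, where n = r10 & 15 and chunk[] is the
+ * 16-column u16 result of the fetch (a sketch, not part of the build):
+ *
+ *   for (int j = 15; j >= n; j--) chunk[j] = chunk[j - n];  // shift right by n
+ *   for (int j = 0;  j < n;  j++) chunk[j] = leftpad;       // value dup'ed in q8/q9
+ *   src -= n;   // the n columns pushed off the end are re-read by the next fetch
+ *
+ * The uchar4 variant below does the same, but replicates the first whole pixel
+ * (four halfwords) rather than a single lane.
+ */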
+
+PRIVATE(fetch_clampleft4)
+ push {r12,lr}
+ bl fetch_generic_asm
+ vmov.u16 d16, d20
+ vmov.u16 d17, d20
+ vmov.u16 d18, d20
+ vmov.u16 d19, d20
+ ands r12, r10, #15
+ beq 1f
+ sub r1, r1, r12
+ sub r10, r10, r12
+ sub sp, sp, #32
+ vst1.u16 {q10-q11}, [sp]
+ sub r12, sp, r12, LSL #1
+ sub sp, sp, #32
+ vst1.u16 {q8,q9}, [sp]
+ vld1.u16 {q10,q11}, [r12]
+ add sp, sp, #64
+1: pop {r12,pc}
+END(fetch_clampleft4)
+
+/* Fetch only the next (r11 & 15) (where 0 means 16) columns of data, avoiding
+ * reading memory beyond that limit, and filling the rest of the vector with
+ * the last legal pixel.
+ * Result is in q10 and q11. q12 and q13 are filled with the last legal pixel.
+ * Note: This function can read beyond the left edge of input if the image is
+ * narrower than 16 bytes.
+ */
+PRIVATE(fetch_clampright1)
+ push {r12, lr}
+ rsb r12, r11, #0
+ ands r12, r12, #15
+ beq 1f
+ sub r1, r1, r12
+ bl fetch_generic_asm
+ vdup.u16 q12, d23[3]
+ vdup.u16 q13, d23[3]
+ rsb r12, r11, #0
+ and r12, r12, #15
+ sub sp, sp, #32
+ vst1.u16 {q12,q13}, [sp]
+ sub sp, sp, #32
+ add r12, sp, r12, LSL #1
+ vst1.u16 {q10,q11}, [sp]
+ vld1.u16 {q10,q11}, [r12]
+ add sp, sp, #64
+ pop {r12,pc}
+1: bl fetch_generic_asm
+ vdup.u16 q12, d23[3]
+ vdup.u16 q13, d23[3]
+ pop {r12,pc}
+END(fetch_clampright1)
+
+PRIVATE(fetch_clampright4)
+ push {r12, lr}
+ rsb r12, r11, #0
+ ands r12, r12, #15
+ beq 1f
+ sub r1, r1, r12
+ bl fetch_generic_asm
+ vmov.u16 d24, d23
+ vmov.u16 d25, d23
+ vmov.u16 d26, d23
+ vmov.u16 d27, d23
+ rsb r12, r11, #0
+ and r12, r12, #15
+ sub sp, sp, #32
+ vst1.u16 {q12-q13}, [sp]
+ sub sp, sp, #32
+ add r12, sp, r12, LSL #1
+ vst1.u16 {q10,q11}, [sp]
+ vld1.u16 {q10,q11}, [r12]
+ add sp, sp, #64
+ pop {r12,pc}
+1: bl fetch_generic_asm
+ vmov.u16 d24, d23
+ vmov.u16 d25, d23
+ vmov.u16 d26, d23
+ vmov.u16 d27, d23
+ pop {r12,pc}
+END(fetch_clampright4)
+
+/* Given values in q10 and q11, and an index in r11, sweep the (r11 & 15)th
+ * value across to fill the rest of the register pair. Used for filling the
+ * right hand edge of the window when reading too close to the right hand edge
+ * of the image.
+ * Also returns a dup-ed copy of the last element in q12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
+ */
+PRIVATE(prefill_sweepright1)
+ ands r12, r11, #15
+ beq 1f
+ sub r12, r12, #1
+ sub sp, sp, #64
+ vst1.u16 {q10,q11}, [sp]
+ add r12, sp, r12, LSL #1
+ vld1.u16 {d24[],d25[]}, [r12]
+ vld1.u16 {d26[],d27[]}, [r12]
+ vst1.u16 {q12,q13}, [r12]
+ vld1.u16 {q10,q11}, [sp]
+ add sp, sp, #64
+ bx lr
+1: vdup.u16 q12, d23[3]
+ vdup.u16 q13, d23[3]
+ bx lr
+END(prefill_sweepright1)
+
+PRIVATE(prefill_sweepright4)
+ ands r12, r11, #15
+ beq 1f
+ sub r12, r12, #4
+ sub sp, sp, #64
+ vst1.u16 {q10,q11}, [sp]
+ add r12, sp, r12, LSL #1
+ vld1.u64 {d24}, [r12]
+ vld1.u64 {d25}, [r12]
+ vld1.u64 {d26}, [r12]
+ vld1.u64 {d27}, [r12]
+ vst1.u16 {q12,q13}, [r12]
+ vld1.u16 {q10,q11}, [sp]
+ add sp, sp, #64
+ bx lr
+1: vmov.u16 d24, d23
+ vmov.u16 d25, d23
+ vmov.u16 d26, d23
+ vmov.u16 d27, d23
+ bx lr
+END(prefill_sweepright4)
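+
+/* Net effect of the register shuffle in prefill_sweepright1 above, where
+ * idx = r11 & 15 and win[] is the 16 u16 values held in q10:q11 (a sketch,
+ * not part of the build):
+ *
+ *   uint16_t last = win[idx - 1];
+ *   for (int j = idx; j < 16; j++) win[j] = last;   // sweep the value right
+ *   // q12 and q13 are left holding `last` in every lane.
+ *
+ * prefill_sweepright4 does the same with the last whole pixel (four halfwords).
+ */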
+
+/* The main loop keeps a sliding window of data that has already been convolved
+ * in the vertical axis for the current line. This usually stays in the
+ * register file, but spills to memory for large windows. The first thing that
+ * needs to be done at start-up is to fill this window with image data, taking
+ * into account the padding needed if the left or right edges of the image fall
+ * within this window.
+ */
+
+/* Because the window is in the register file, writes to it cannot be indexed
+ * by another register. Consequently the fill loops are unrolled to address
+ * the registers directly. This macro distinguishes between writes to the
+ * register file and writes to the spill buffer (indicated by a destination
+ * register named xx).
+ */
+.macro prefill_out ra, rb, sra, srb, srb_hi
+ .ifc \ra,xx
+ .ifc \rb,xx
+ vst1.u16 {\sra,\srb}, [r9:128]!
+ .else
+ /* this case is used only for the last tap of uchar1 r=25 */
+ /* discard \sra */
+ vmov.u16 \rb, \srb_hi
+ .endif
+ .else
+ .ifnc \ra,\sra
+ vmov.u16 \ra, \sra
+ .endif
+ .ifnc \rb,\srb
+ vmov.u16 \rb, \srb
+ .endif
+ .endif
+.endm
+
+/* This macro provides the list of registers representing the window, and the
+ * cases where the register file is too small and a spill buffer is used
+ * instead.
+ * Since several specialisations of each function are generated, this also
+ * culls superfluous iterations, and sets the variable `i` for subsequent
+ * macros indicating the current index into the window.
+ */
+.macro prefill_list, macro, nextmacro, max_r, step, label
+ .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
+ .if windowsize >= (\line * 16)
+ .set i, windowsize - (\line * 16)
+\label\macro\line:
+ prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
+ .endif
+ .endm
+ .if \step > 1
+ ifneeded \macro \nextmacro, 13, 12, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 12, 11, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 11, 10, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 10, 9, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 9, 8, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 8, 7, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 7, 6, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 6, 5, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 5, 4, xx, xx, \step, \label
+ ifneeded \macro \nextmacro, 4, 3, xx, xx, \step, \label
+ .else
+ /* q3 normally contains the coefficient table, but it's not fully
+ * used. In the uchar1, r=25 case the other half of q3 is used for
+ * the last two window taps to avoid falling out to memory.
+ */
+ ifneeded \macro \nextmacro, 4, 3, xx, d7, \step, \label
+ .endif
+ ifneeded \macro \nextmacro, 3, 2, q4, q5, \step, \label
+ ifneeded \macro \nextmacro, 2, 1, q6, q7, \step, \label
+ ifneeded \macro \nextmacro, 1, 0, q8, q9, \step, \label
+
+\label\macro\()0:
+ b \label\()_end
+ .purgem ifneeded
+.endm
+
+/* These macros represent the possible stages of filling the window.
+ * Each macro is unrolled enough times that it can fill the entire window
+ * itself, but normally it will have to hand control to subsequent macros
+ * part-way through and this is done using labels named \next and \after, where
+ * \next is the next macro starting at the same window position and \after is
+ * the next macro starting after the current window position.
+ */
+
+/* leftfill: q8 and q9 contain the left padding value. While the window
+ * extends outside of the image on the left-hand side, and at least 16 more
+ * padding values are needed in the window, store q8 and q9 into the window.
+ * Otherwise skip forward to storing image data.
+ */
+.macro prefill_leftfill, next, after, ra, rb, step
+ cmp r10, #i+16
+ blo \next
+ prefill_out \ra, \rb, q8, q9, d19
+.endm
+
+/* leftedge: The very first non-fill or partial-fill chunk from the image is
+ * already loaded (as it was used to calculate the left padding value), so
+ * store it here, and then drop into the regular load/store cycle in the next
+ * macro.
+ */
+.macro prefill_leftedge, next, after, ra, rb, step
+1: prefill_out \ra, \rb, q10, q11, d23
+ b \after
+.endm
+
+/* dofetch: Copy chunks of the image into the window without any complications
+ * from edge conditions.
+ */
+.macro prefill_dofetch, next, after, ra, rb, step
+ cmp r11, #i+16
+ bls \next
+ bl fetch_generic_asm
+ prefill_out \ra, \rb, q10, q11, d23
+.endm
+
+/* rightedge: The last fetch (currently in q10 and q11) may have gone beyond
+ * the right-hand edge of the image. In that case sweep the last valid pixel
+ * across the rest of the chunk, and in either case prepare padding data in q12
+ * and q13 for the next macro. This is done in fetch_clampright.
+ * This only happens once before going on to the next macro.
+ * Sometimes leftedge also covers the rightedge case, in which case this has
+ * to be skipped altogether.
+ */
+.macro prefill_rightedge, next, after, ra, rb, step
+ cmp r11, #i
+ bls \next
+ bl fetch_clampright\step
+ prefill_out \ra, \rb, q10, q11, d23
+ b \after
+.endm
+
+/* rightfill: The rest of the window is simply filled with right padding from
+ * q12 and q13.
+ */
+.macro prefill_rightfill, next, after, ra, rb, step
+ prefill_out \ra, \rb, q12, q13, d25
+.endm
+
+/* Here all of the macros above are unrolled and laid out in the proper order.
+ */
+.macro prefill_body, max_r, step, label
+ prefill_list leftfill, leftedge, \max_r, \step, \label
+ prefill_list leftedge, dofetch, \max_r, \step, \label
+ prefill_list dofetch, rightedge, \max_r, \step, \label
+ prefill_list rightedge, rightfill, \max_r, \step, \label
+ prefill_list rightfill, oops, \max_r, \step, \label
+\label\()_end:
+.endm
+
+/* Fill the convolution window with context data. The aim here is to load
+ * exactly 2*r columns, and in the main loop to read as many columns as will be
+ * written. This is complicated by the window being divided into chunks at
+ * register boundaries, by the need to handle cases where the input starts very
+ * close to the left or right (or both) edges of the image, and by the need to
+ * fill the spaces that leaves with left and right edge padding values.
+ *
+ * Input:
+ * r1 -- src
+ * r2 -- pitch
+ * r3 -- count
+ * r4 -- available image data right of src pointer
+ * r5 -- r
+ * r6 -- rup
+ * r7 -- rdn
+ * r8 -- available image data left of src pointer
+ * r9 -- buffer (if needed)
+ * Output:
+ * r4 -= min(inlen, count + windowsize - centertap)
+ * r1 += min(inlen, count + windowsize - centertap)
+ * Modifies:
+ * r10 -- fill start index in the window
+ * r11 -- fill stop index in the window
+ * r12 -- scratch
+ */
+.macro prefill step=1, max_r=25, label=xx
+.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
+.set centertap, (windowsize - \max_r * \step)
+ mov r10, #centertap
+ subs r10, r10, r8
+ movlo r10, #0
+
+ subs r11, r4, #windowsize - centertap
+ movhs r11, #0
+ add r11, r11, #windowsize
+
+ /* r10 indicates where in the window legal image data begins.
+ * r11 indicates where in the window legal image data ends.
+ * When starting near the centre of a large image these would be
+ * zero and windowsize respectively, but when starting near the
+ * edges this can change.
+ * When starting on the leftmost pixel, r10 will be centertap.
+ * When starting on the rightmost pixel, r11 will be centertap+1.
+ */
+
+ /* r4 indicates how much data there is between the current pointers
+ * and the right edge of the image. The pointers currently point
+ * to the data needed at centertap. The subsequent code will
+ * consume (windowsize - r10) data, but only the data from
+ * centertap to windowsize comes out of r4's budget.
+ */
+1: subs r4, r4, #windowsize - centertap
+ movlo r4, #0
+
+ /* And the pointers need to rewind to the start of the window.
+ */
+ sub r1, r1, #centertap
+
+ /* Unless r8 indicated that there wasn't that much data available.
+ */
+ add r1, r1, r10
+
+
+ /* Get the first chunk, and add padding to align it to the window
+ * if necessary.
+ */
+ bl fetch_clampleft\step
+
+ /* Sometimes the start and the end of the window are in the same
+ * chunk. In that case both ends need filler at the outset.
+ */
+ sub r12, r11, #1
+ eor r12, r10, r12
+ cmp r12, #16
+ bllo prefill_sweepright\step
+
+ /* Iterate through all the points in the window and fill them in
+ * with padding or image data as needed.
+ */
+ prefill_body \max_r, \step, \label
+.endm
+
+/* The main body of the convolve functions. Having already pre-filled the
+ * convolution window with 2*r input values, the logic settles into a regular
+ * pattern of reading and writing at a 1:1 rate until either input or output
+ * expires. The input leads the output by r values, so when processing all the
+ * way to the right-hand edge, or within r pixels of that edge, the input will
+ * run out first. In the case of very narrow images, or sub-windows starting
+ * near the right edge, the input may already have run out while the
+ * convolution window was being filled and this loop will start with a
+ * zero-length input.
+ *
+ * Once the input runs out, the rest of the output must be processed by padding
+ * the remainder of the window with pad value from the last valid pixel from
+ * the source.
+ *
+ * Input:
+ * r0 = dst
+ * r1 = src
+ * r2 = pitch
+ * r3 = count
+ * r4 = inlen
+ * r5 = r
+ * r6 = rup
+ * r7 = rdn
+ * r9 = buffer
+ * Modifies
+ * r8 = fetch code pointer
+ */
+.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
+
+ /* If r4 >= r3 then there's no need for clipping. The main loop
+ * needs to exit when either r3 or r4 runs out, so clamp r4 to be
+ * no greater than r3 and use r4 for the loop.
+ * However, if r4 comes out of the loop with less than 16 bytes
+ * left, a partial read would be necessary to avoid reading beyond
+ * the end of the image. To avoid this, clamp r4 to the next
+ * multiple of 16, which is still sufficient to force it out of the
+ * loop but doesn't imply a rewind.
+ */
+ add r12, r3, #15
+ bic r12, r12, #15
+ cmp r4, r12
+ movhi r4, r12
+
+ /* First calculate the entry-point into the internal fetch logic.
+ * This is done so the same function can service several kernel
+ * sizes.
+ */
+ ldr r8, 3f
+1: add r8, r8, pc
+ sub r8, r5, LSL #5
+ sub r8, r5, LSL #4
+ cmp r5, r6
+ cmpeq r5, r7
+ beq 5f
+
+ /* if (r != rup || r != rdn) then the address-clamping table should
+ * be used rather than the short-cut version.
+ */
+ ldr r8, 3f+4
+2: add r8, r8, pc
+ sub r8, r5, LSL #6
+ b 5f
+ .align 3
+3: .word \labelnc-1b-8
+ .word \labelc-2b-8
+
+ /* Main loop: ... */
+ .align 4
+3: /* first perform a vertical convolution from memory to get the next
+ * 16 taps of the horizontal window into the register file...
+ */
+ fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=r8
+
+ /* ...then perform a horizontal convolution on that window to
+ * produce eight output bytes, and slide the window along.
+ * This has to be done twice to match the 16-way vertical pass.
+ * It would be preferable to have twice the work done in \core, but
+ * that would demand yet another variant on those macros and would
+ * perturb the register allocation severely.
+ */
+ \core
+ vst1.u8 {d31}, [r0]!
+ \core
+ vst1.u8 {d31}, [r0]!
+
+ sub r3, r3, #16
+5: subs r4, r4, #16
+ bhi 3b
+ /* Here there's 16 or fewer bytes available before the edge of the
+ * source image. r4 holds that count minus 16 (because it was
+ * decremented before the first iteration ran). The last read may
+ * not be a whole chunk, and beyond that a fill value must be used.
+ *
+ * Of course, none of that matters if there's no more output to
+ * produce...
+ */
+ cmp r3, #0
+ beq 5f
+
+ /* Oh well. */
+ adds r4, r4, #16
+ bne 1f
+ .if \step==1
+ vdup.u16 q10, d19[3]
+ vdup.u16 q11, d19[3]
+ .else
+ vmov.u64 d20, d19
+ vmov.u64 d21, d19
+ vmov.u64 d22, d19
+ vmov.u64 d23, d19
+ .endif
+ b 3f
+
+ /* To avoid reading past end of input, rewind pointers by (16-r4)
+ * to ensure that they're exactly 16 bytes from the edge.
+ */
+1: mov r11, r4
+ bl fetch_clampright\step
+ /* Now to put this padding to use, perform any remaining
+ * iterations. This is done at half the rate of the main loop,
+ * because there's no longer pressure from a 16-lane window filler.
+ */
+3: \core
+ .if \step==1
+ vdup.u16 q11, d23[3]
+ .else
+ vmov.u64 d22, d23
+ .endif
+ subs r3, r3, #8
+ blo 4f
+ vst1.u8 {d31}, [r0]!
+ bne 3b
+ b 5f
+
+ /* If the final iteration contained 0 < l < 8 values, then perform
+ * a piecewise store of the final vector.
+ */
+4: tst r3, #4
+ beq 1f
+ vst1.u32 {d31[0]}, [r0]!
+ vext.u8 d31, d31, d31, #4
+1: tst r3, #2
+ beq 1f
+ vst1.u16 {d31[0]}, [r0]!
+ vext.u8 d31, d31, d31, #2
+1: tst r3, #1
+ beq 5f
+ vst1.u8 {d31[0]}, [r0]!
+ vext.u8 d31, d31, d31, #1
+5: mov r0, #0
+.endm
+
+.irp r, TUNED_LIST1, 25
+PRIVATE(convolve1_\r)
+ push {r12,lr}
+
+ prefill step=1, max_r=\r, label=.Lcnv1_\r
+
+ conv_body core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
+
+ pop {r12,pc}
+END(convolve1_\r)
+.endr
+
+.irp r, TUNED_LIST4, 25
+PRIVATE(convolve4_\r)
+ push {r12,lr}
+ sub r9, sp, #0x200
+ sub sp, sp, #0x200 + 0x400
+ bic r9, r9, #0x3fc
+
+ /* r9 now points to a 0x200 byte buffer on the stack whose address
+ * has the low 10 bits clear. This allows easy address calculation
+ * in the wrap-around cases.
+ */
+
+ prefill step=4, max_r=\r, label=.Lcnv4_\r
+
+ conv_body core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
+
+ add sp, sp, #0x200 + 0x400
+ pop {r12,pc}
+END(convolve4_\r)
+.endr
+
+/* void rsdIntrinsicBlurU1_K(
+ * void *out, // r0
+ * void *in, // r1
+ * size_t w, // r2
+ * size_t h, // r3
+ * size_t p, // [sp]
+ * size_t x, // [sp,#4]
+ * size_t y, // [sp,#8]
+ * size_t count, // [sp,#12]
+ * size_t r, // [sp,#16]
+ * uint16_t *tab); // [sp,#20]
+ */
+ENTRY(rsdIntrinsicBlurU1_K)
+ push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ vpush {d8-d15}
+ ldr r6, [sp,#112] // y
+ ldr r8, [sp,#108] // x
+ ldr r5, [sp,#120] // r
+ sub r4, r2, r8 // inlen = w - x
+ sub r7, r3, r6 // h - y
+ ldr r2, [sp,#104] // pitch
+ ldr r3, [sp,#116] // count
+ sub r7, r7, #1 // h - y - 1
+
+ ldr r12, [sp,#124]
+
+ add r1, r1, r8 // src += x
+
+ cmp r6, r5
+ movhi r6, r5 // rup = min(r, y)
+ cmp r7, r5
+ movhi r7, r5 // rdn = min(r, h - y - 1)
+
+ vld1.u16 {d0,d1,d2,d3}, [r12]!
+ vld1.u16 {d4,d5,d6}, [r12]!
+
+ adr lr, 1f
+ .irp r, TUNED_LIST1
+ cmp r5, #\r
+ bls convolve1_\r
+ .endr
+ b convolve1_25
+
+1: vpop {d8-d15}
+ pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+END(rsdIntrinsicBlurU1_K)
+
+/* void rsdIntrinsicBlurU4_K(
+ * void *out, // r0
+ * void *in, // r1
+ * size_t w, // r2
+ * size_t h, // r3
+ * size_t p, // [sp]
+ * size_t x, // [sp,#4]
+ * size_t y, // [sp,#8]
+ * size_t count, // [sp,#12]
+ * size_t r, // [sp,#16]
+ * uint16_t *tab); // [sp,#20]
+ */
+ENTRY(rsdIntrinsicBlurU4_K)
+ push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ vpush {d8-d15}
+ ldr r6, [sp,#112] // y
+ ldr r8, [sp,#108] // x
+ ldr r5, [sp,#120] // r
+ lsl r8, r8, #2
+ rsb r4, r8, r2, LSL #2 // inlen = (w - x) * 4
+ sub r7, r3, r6 // h - y
+ ldr r2, [sp,#104] // pitch
+ ldr r3, [sp,#116] // count
+ sub r7, r7, #1 // h - y - 1
+ lsl r3, r3, #2 // count in bytes
+
+ ldr r12, [sp,#124]
+
+ add r1, r1, r8 // in += x
+
+ cmp r6, r5
+ movhi r6, r5 // rup = min(r, y)
+ cmp r7, r5
+ movhi r7, r5 // rdn = min(r, h - y - 1)
+
+ vld1.u16 {d0,d1,d2,d3}, [r12]!
+ vld1.u16 {d4,d5,d6}, [r12]!
+
+ adr lr, 1f
+ .irp r, TUNED_LIST4
+ cmp r5, #\r
+ bls convolve4_\r
+ .endr
+ b convolve4_25
+
+1: vpop {d8-d15}
+ pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+END(rsdIntrinsicBlurU4_K)
diff --git a/toolkit/ColorMatrix.cpp b/toolkit/ColorMatrix.cpp
new file mode 100644
index 0000000..dd426cf
--- /dev/null
+++ b/toolkit/ColorMatrix.cpp
@@ -0,0 +1,1066 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+#include <assert.h>
+#include <cstdint>
+#include <sys/mman.h>
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.ColorMatrix"
+
+/* uint kernel
+ * Q0 D0: Load slot for R
+ * D1: Load slot for G
+ * Q1 D2: Load slot for B
+ * D3: Load slot for A
+ * Q2 D4: Matrix
+ * D5: =
+ * Q3 D6: =
+ * D7: =
+ * Q4 D8: Add R
+ * D9:
+ * Q5 D10: Add G
+ * D11:
+ * Q6 D12: Add B
+ * D13:
+ * Q7 D14: Add A
+ * D15:
+ * Q8 D16: I32: R Sum
+ * D17:
+ * Q9 D18: I32: G Sum
+ * D19:
+ * Q10 D20: I32: B Sum
+ * D21:
+ * Q11 D22: I32: A Sum
+ * D23:
+ * Q12 D24: U16: expanded R
+ * D25:
+ * Q13 D26: U16: expanded G
+ * D27:
+ * Q14 D28: U16: expanded B
+ * D29:
+ * Q15 D30: U16: expanded A
+ * D31:
+ *
+ */
+
+/* float kernel
+ * Q0 D0: Load slot for R
+ * D1: =
+ * Q1 D2: Load slot for G
+ * D3: =
+ * Q2 D4: Load slot for B
+ * D5: =
+ * Q3 D6: Load slot for A
+ * D7: =
+ * Q4 D8: Matrix
+ * D9: =
+ * Q5 D10: =
+ * D11: =
+ * Q6 D12: =
+ * D13: =
+ * Q7 D14: =
+ * D15: =
+ * Q8 D16: Add R
+ * D17: =
+ * Q9 D18: Add G
+ * D19: =
+ * Q10 D20: Add B
+ * D21: =
+ * Q11 D22: Add A
+ * D23: =
+ * Q12 D24: Sum R
+ * D25: =
+ * Q13 D26: Sum G
+ * D27: =
+ * Q14 D28: Sum B
+ * D29: =
+ * Q15 D30: Sum A
+ * D31: =
+ *
+ */
+
+typedef union {
+ uint64_t key;
+ struct {
+ uint32_t inVecSize :2; // [0 - 1]
+ uint32_t outVecSize :2; // [2 - 3]
+ uint32_t inType :4; // [4 - 7]
+ uint32_t outType :4; // [8 - 11]
+ uint32_t dot :1; // [12]
+ uint32_t _unused1 :1; // [13]
+ uint32_t copyAlpha :1; // [14]
+ uint32_t _unused2 :1; // [15]
+ uint32_t coeffMask :16; // [16-31]
+ uint32_t addMask :4; // [32-35]
+ } u;
+} Key_t;
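+
+// Illustrative example: for a 4-channel u8 -> 4-channel u8 operation,
+// computeKey() below produces inVecSize = outVecSize = 3, leaves inType
+// and outType at 0, sets one coeffMask bit per non-zero matrix entry,
+// and sets copyAlpha when the alpha channel is a plain pass-through.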
+
+/* The two data types and their values, as specified in the RenderScript documentation.
+ * Only RS_TYPE_UNSIGNED_8 is currently supported.
+ *
+ * TODO: The actual values of these constants are likely not important. We may be
+ * able to simplify the key related code.
+ */
+const int RS_TYPE_UNSIGNED_8 = 8;
+const int RS_TYPE_FLOAT_32 = 2;
+
+//Re-enable when intrinsic is fixed
+#if defined(ARCH_ARM64_USE_INTRINSICS)
+typedef struct {
+ void (*column[4])(void);
+ void (*store)(void);
+ void (*load)(void);
+ void (*store_end)(void);
+ void (*load_end)(void);
+} FunctionTab_t;
+
+extern "C" void rsdIntrinsicColorMatrix_int_K(
+ void *out, void const *in, size_t count,
+ FunctionTab_t const *fns,
+ int16_t const *mult, int32_t const *add);
+
+extern "C" void rsdIntrinsicColorMatrix_float_K(
+ void *out, void const *in, size_t count,
+ FunctionTab_t const *fns,
+ float const *mult, float const *add);
+
+/* The setup functions fill in function tables to be used by the above functions;
+ * this code also eliminates jump-to-another-jump cases by short-circuiting
+ * empty functions. While it's not performance critical, it works out easier
+ * to write the set-up code in assembly than to try to expose the same symbols
+ * and write the code in C.
+ */
+extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
+ FunctionTab_t *fns,
+ uint32_t mask, int dt, int st);
+
+extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
+ FunctionTab_t *fns,
+ uint32_t mask, int dt, int st);
+#endif
+
+class ColorMatrixTask : public Task {
+ const void* mIn;
+ void* mOut;
+ size_t mInputVectorSize;
+ uint32_t mOutstep;
+ uint32_t mInstep;
+
+ float mFp[16];
+ float mFpa[4];
+
+ // The following four fields are read as constants
+ // by the SIMD assembly code.
+ int16_t mIp[16];
+ int mIpa[4];
+ float mTmpFp[16];
+ float mTmpFpa[4];
+#if defined(ARCH_ARM64_USE_INTRINSICS)
+ FunctionTab_t mFnTab;
+#endif
+
+ void kernel(uchar* out, uchar* in, uint32_t xstart, uint32_t xend);
+ void updateCoeffCache(float fpMul, float addMul);
+
+ Key_t mLastKey;
+ unsigned char* mBuf;
+ size_t mBufSize;
+
+ bool build(Key_t key);
+ void (*mOptKernel)(void* dst, const void* src, const int16_t* coef, uint32_t count);
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+ Key_t computeKey(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
+ void preLaunch(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
+#else
+ Key_t computeKey(size_t inVectorSize, size_t outVectorSize);
+ void preLaunch(size_t inVectorSize, size_t outVectorSize);
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ public:
+ ColorMatrixTask(const void* in, void* out, size_t inputVectorSize, size_t outputVectorSize,
+ size_t sizeX, size_t sizeY, const float* matrix, const float* addVector,
+ const Restriction* restriction)
+ : Task{sizeX, sizeY, outputVectorSize, true, restriction},
+ mIn{in},
+ mOut{out},
+ mInputVectorSize{inputVectorSize} {
+ mLastKey.key = 0;
+ mBuf = nullptr;
+ mBufSize = 0;
+ mOptKernel = nullptr;
+
+ mOutstep = paddedSize(outputVectorSize);
+ mInstep = paddedSize(inputVectorSize);
+
+ memcpy(mFp, matrix, sizeof(mFp));
+ memcpy(mFpa, addVector, sizeof(mFpa));
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+ // For float support, we'll have to pass the type in the constructor too.
+ preLaunch(inputVectorSize, RS_TYPE_UNSIGNED_8, outputVectorSize, RS_TYPE_UNSIGNED_8);
+#else
+ preLaunch(inputVectorSize, outputVectorSize);
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+ }
+ ~ColorMatrixTask() {
+ if (mBuf) munmap(mBuf, mBufSize);
+ mBuf = nullptr;
+ mOptKernel = nullptr;
+ }
+};
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+Key_t ColorMatrixTask::computeKey(size_t inVectorSize, int inType, size_t outVectorSize,
+ int outType) {
+ Key_t key;
+ key.key = 0;
+
+ // Compute a unique code key for this operation
+
+ // Add to the key the input and output types
+ bool hasFloat = false;
+ if (inType == RS_TYPE_FLOAT_32) {
+ hasFloat = true;
+ key.u.inType = RS_TYPE_FLOAT_32;
+ }
+ if (outType == RS_TYPE_FLOAT_32) {
+ hasFloat = true;
+ key.u.outType = RS_TYPE_FLOAT_32;
+ }
+
+ // Mask in the bits indicating which coefficients in the
+ // color matrix are needed.
+ if (hasFloat) {
+ for (uint32_t i=0; i < 16; i++) {
+ if (fabs(mFp[i]) != 0.f) {
+ key.u.coeffMask |= 1 << i;
+ }
+ }
+ if (fabs(mFpa[0]) != 0.f) key.u.addMask |= 0x1;
+ if (fabs(mFpa[1]) != 0.f) key.u.addMask |= 0x2;
+ if (fabs(mFpa[2]) != 0.f) key.u.addMask |= 0x4;
+ if (fabs(mFpa[3]) != 0.f) key.u.addMask |= 0x8;
+
+ } else {
+#else
+Key_t ColorMatrixTask::computeKey(size_t inVectorSize, size_t outVectorSize) {
+ Key_t key;
+ key.key = 0;
+
+ // Compute a unique code key for this operation
+ {
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+ for (uint32_t i=0; i < 16; i++) {
+ if (mIp[i] != 0) {
+ key.u.coeffMask |= 1 << i;
+ }
+ }
+ if (mIpa[0] != 0) key.u.addMask |= 0x1;
+ if (mIpa[1] != 0) key.u.addMask |= 0x2;
+ if (mIpa[2] != 0) key.u.addMask |= 0x4;
+ if (mIpa[3] != 0) key.u.addMask |= 0x8;
+ }
+
+ // Look for a dot product where the r,g,b columns are the same
+ if ((mIp[0] == mIp[1]) && (mIp[0] == mIp[2]) &&
+ (mIp[4] == mIp[5]) && (mIp[4] == mIp[6]) &&
+ (mIp[8] == mIp[9]) && (mIp[8] == mIp[10]) &&
+ (mIp[12] == mIp[13]) && (mIp[12] == mIp[14])) {
+
+ if (!key.u.addMask) key.u.dot = 1;
+ }
+
+ // Is alpha a simple copy?
+ if (!(key.u.coeffMask & 0x0888) && (mIp[15] == 256) && !(key.u.addMask & 0x8)) {
+ key.u.copyAlpha = !(key.u.inType || key.u.outType);
+ }
+
+ //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
+
+ switch (inVectorSize) {
+ case 4:
+ key.u.inVecSize = 3;
+ break;
+ case 3:
+ key.u.inVecSize = 2;
+ key.u.coeffMask &= ~0xF000;
+ break;
+ case 2:
+ key.u.inVecSize = 1;
+ key.u.coeffMask &= ~0xFF00;
+ break;
+ default:
+ key.u.coeffMask &= ~0xFFF0;
+ break;
+ }
+
+ switch (outVectorSize) {
+ case 4:
+ key.u.outVecSize = 3;
+ break;
+ case 3:
+ key.u.outVecSize = 2;
+ key.u.coeffMask &= ~0x8888;
+ key.u.addMask &= 7;
+ break;
+ case 2:
+ key.u.outVecSize = 1;
+ key.u.coeffMask &= ~0xCCCC;
+ key.u.addMask &= 3;
+ break;
+ default:
+ key.u.coeffMask &= ~0xEEEE;
+ key.u.addMask &= 1;
+ break;
+ }
+
+ if (key.u.inType && !key.u.outType) {
+ key.u.addMask |= 1;
+ if (key.u.outVecSize > 0) key.u.addMask |= 2;
+ if (key.u.outVecSize > 1) key.u.addMask |= 4;
+ if (key.u.outVecSize > 2) key.u.addMask |= 8;
+ }
+
+ //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
+ return key;
+}
+
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
+
+#define DEF_SYM(x) \
+ extern "C" uint32_t _N_ColorMatrix_##x; \
+ extern "C" uint32_t _N_ColorMatrix_##x##_end; \
+ extern "C" uint32_t _N_ColorMatrix_##x##_len;
+
+DEF_SYM(prefix_i)
+DEF_SYM(prefix_f)
+DEF_SYM(postfix1)
+DEF_SYM(postfix2)
+
+DEF_SYM(load_u8_4)
+DEF_SYM(load_u8_3)
+DEF_SYM(load_u8_2)
+DEF_SYM(load_u8_1)
+DEF_SYM(load_u8f_4)
+DEF_SYM(load_u8f_3)
+DEF_SYM(load_u8f_2)
+DEF_SYM(load_u8f_1)
+DEF_SYM(load_f32_4)
+DEF_SYM(load_f32_3)
+DEF_SYM(load_f32_2)
+DEF_SYM(load_f32_1)
+
+DEF_SYM(store_u8_4)
+DEF_SYM(store_u8_2)
+DEF_SYM(store_u8_1)
+DEF_SYM(store_f32_4)
+DEF_SYM(store_f32_3)
+DEF_SYM(store_f32_2)
+DEF_SYM(store_f32_1)
+DEF_SYM(store_f32u_4)
+DEF_SYM(store_f32u_2)
+DEF_SYM(store_f32u_1)
+
+DEF_SYM(unpack_u8_4)
+DEF_SYM(unpack_u8_3)
+DEF_SYM(unpack_u8_2)
+DEF_SYM(unpack_u8_1)
+DEF_SYM(pack_u8_4)
+DEF_SYM(pack_u8_3)
+DEF_SYM(pack_u8_2)
+DEF_SYM(pack_u8_1)
+DEF_SYM(dot)
+DEF_SYM(add_0_u8)
+DEF_SYM(add_1_u8)
+DEF_SYM(add_2_u8)
+DEF_SYM(add_3_u8)
+
+#define ADD_CHUNK(x) \
+ memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
+ buf += _N_ColorMatrix_##x##_len
+
+
+static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
+ size_t off = (target - buf - 8) >> 2;
+ assert(((off & 0xff000000) == 0) ||
+ ((off & 0xff000000) == 0xff000000));
+
+ uint32_t op = (condition << 28);
+ op |= 0xa << 24; // branch
+ op |= 0xffffff & off;
+ ((uint32_t *)buf)[0] = op;
+ return buf + 4;
+}
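+
+// For example, addBranch(buf, buf + 8, 0xe) would encode an unconditional
+// (AL) branch with a zero word offset: (0xe << 28) | (0xa << 24) | 0 ==
+// 0xea000000. The generated loop below uses condition 0x1 (NE).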
+
+static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
+ assert(vd < 32);
+ assert(vm < 32);
+ assert(vn < 32);
+
+ uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
+ op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
+ op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
+ return op;
+}
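+
+// For example, encodeSIMDRegs(17, 2, 3) places Vd=17 in D:Vd (bit 22,
+// bits 15:12), Vn=2 in N:Vn (bit 7, bits 19:16) and Vm=3 in M:Vm (bit 5,
+// bits 3:0), yielding 0x00421003, which the emitters below OR into an
+// instruction template.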
+
+static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
+ uint32_t src_d2_s) {
+ //vmlal.s16 Q#1, D#1, D#2[#]
+ uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
+ ((uint32_t *)buf)[0] = op;
+ return buf + 4;
+}
+
+static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
+ uint32_t src_d2_s) {
+ //vmull.s16 Q#1, D#1, D#2[#]
+ uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
+ ((uint32_t *)buf)[0] = op;
+ return buf + 4;
+}
+
+static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
+ //vqadd.s32 Q#1, Q#1, Q#2
+ uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
+ ((uint32_t *)buf)[0] = op;
+ return buf + 4;
+}
+
+static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
+ uint32_t src_d2_s) {
+ //vmlal.f32 Q#1, D#1, D#2[#]
+ uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
+ ((uint32_t *)buf)[0] = op;
+ return buf + 4;
+}
+
+static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
+ uint32_t src_d2_s) {
+ //vmull.f32 Q#1, D#1, D#2[#]
+ uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
+ ((uint32_t *)buf)[0] = op;
+ return buf + 4;
+}
+
+static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
+ //vorr Q#1, Q#2, Q#3
+ uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
+ ((uint32_t *)buf)[0] = op;
+ return buf + 4;
+}
+
+static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
+ //vmov.32 Q#1, #imm
+ assert(imm == 0);
+ (void) imm; // Avoid unused parameter warnings for non-debug builds
+ uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
+ ((uint32_t *)buf)[0] = op;
+ return buf + 4;
+}
+
+static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
+ //vadd.f32 Q#1, Q#2, Q#3
+ uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
+ ((uint32_t *)buf)[0] = op;
+ return buf + 4;
+}
+#endif
+
+#if defined(ARCH_X86_HAVE_SSSE3)
+extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
+ const int16_t *coef, uint32_t count);
+extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
+ const int16_t *coef, uint32_t count);
+extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
+ const int16_t *coef, uint32_t count);
+
+using android::renderscript::Key_t;
+
+void * selectKernel(Key_t key)
+{
+ void * kernel = nullptr;
+
+ // inType, outType float if nonzero
+ if (!(key.u.inType || key.u.outType)) {
+ if (key.u.dot)
+ kernel = (void *)rsdIntrinsicColorMatrixDot_K;
+ else if (key.u.copyAlpha)
+ kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
+ else
+ kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
+ }
+
+ return kernel;
+}
+#endif
+
+bool ColorMatrixTask::build(Key_t key) {
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
+ mBufSize = 4096;
+ //StopWatch build_time("rs cm: build time");
+ mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mBuf == MAP_FAILED) {
+ mBuf = NULL;
+ return false;
+ }
+
+ uint8_t *buf = mBuf;
+ uint8_t *buf2 = nullptr;
+
+ int ops[5][4]; // 0 = unused, 2 = multiply (first use), 3 = multiply-accumulate
+ int opInit[4] = {0, 0, 0, 0};
+
+ memset(ops, 0, sizeof(ops));
+ for (int i=0; i < 4; i++) {
+ if (key.u.coeffMask & (1 << (i*4))) {
+ ops[i][0] = 0x2 | opInit[0];
+ opInit[0] = 1;
+ }
+ if (!key.u.dot) {
+ if (key.u.coeffMask & (1 << (1 + i*4))) {
+ ops[i][1] = 0x2 | opInit[1];
+ opInit[1] = 1;
+ }
+ if (key.u.coeffMask & (1 << (2 + i*4))) {
+ ops[i][2] = 0x2 | opInit[2];
+ opInit[2] = 1;
+ }
+ }
+ if (!key.u.copyAlpha) {
+ if (key.u.coeffMask & (1 << (3 + i*4))) {
+ ops[i][3] = 0x2 | opInit[3];
+ opInit[3] = 1;
+ }
+ }
+ }
+
+ if (key.u.inType || key.u.outType) {
+ key.u.copyAlpha = 0;
+ ADD_CHUNK(prefix_f);
+ buf2 = buf;
+
+ // Load the incoming r,g,b,a as needed
+ if (key.u.inType) {
+ switch(key.u.inVecSize) {
+ case 3:
+ ADD_CHUNK(load_f32_4);
+ break;
+ case 2:
+ ADD_CHUNK(load_f32_3);
+ break;
+ case 1:
+ ADD_CHUNK(load_f32_2);
+ break;
+ case 0:
+ ADD_CHUNK(load_f32_1);
+ break;
+ }
+ } else {
+ switch(key.u.inVecSize) {
+ case 3:
+ ADD_CHUNK(load_u8f_4);
+ break;
+ case 2:
+ ADD_CHUNK(load_u8f_3);
+ break;
+ case 1:
+ ADD_CHUNK(load_u8f_2);
+ break;
+ case 0:
+ ADD_CHUNK(load_u8f_1);
+ break;
+ }
+ }
+
+ for (int i=0; i < 4; i++) {
+ for (int j=0; j < 4; j++) {
+ switch(ops[i][j]) {
+ case 0:
+ break;
+ case 2:
+ buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
+ break;
+ case 3:
+ buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
+ break;
+ }
+ }
+ }
+ for (int j=0; j < 4; j++) {
+ if (opInit[j]) {
+ if (key.u.addMask & (1 << j)) {
+ buf = addVADD_F32(buf, j, 12+j, 8+j);
+ } else {
+ buf = addVORR_32(buf, j, 12+j, 12+j);
+ }
+ } else {
+ if (key.u.addMask & (1 << j)) {
+ buf = addVORR_32(buf, j, 8+j, 8+j);
+ } else {
+ buf = addVMOV_32(buf, j, 0);
+ }
+ }
+ }
+
+ if (key.u.outType) {
+ switch(key.u.outVecSize) {
+ case 3:
+ ADD_CHUNK(store_f32_4);
+ break;
+ case 2:
+ ADD_CHUNK(store_f32_3);
+ break;
+ case 1:
+ ADD_CHUNK(store_f32_2);
+ break;
+ case 0:
+ ADD_CHUNK(store_f32_1);
+ break;
+ }
+ } else {
+ switch(key.u.outVecSize) {
+ case 3:
+ case 2:
+ ADD_CHUNK(store_f32u_4);
+ break;
+ case 1:
+ ADD_CHUNK(store_f32u_2);
+ break;
+ case 0:
+ ADD_CHUNK(store_f32u_1);
+ break;
+ }
+ }
+
+
+ } else {
+ // Add the function prefix
+ // Store the address for the loop return
+ ADD_CHUNK(prefix_i);
+ buf2 = buf;
+
+ // Load the incoming r,g,b,a as needed
+ switch(key.u.inVecSize) {
+ case 3:
+ ADD_CHUNK(load_u8_4);
+ if (key.u.copyAlpha) {
+ ADD_CHUNK(unpack_u8_3);
+ } else {
+ ADD_CHUNK(unpack_u8_4);
+ }
+ break;
+ case 2:
+ ADD_CHUNK(load_u8_3);
+ ADD_CHUNK(unpack_u8_3);
+ break;
+ case 1:
+ ADD_CHUNK(load_u8_2);
+ ADD_CHUNK(unpack_u8_2);
+ break;
+ case 0:
+ ADD_CHUNK(load_u8_1);
+ ADD_CHUNK(unpack_u8_1);
+ break;
+ }
+
+ // Add multiply and accumulate
+ // use MULL to init the output register,
+ // use MLAL from there
+ for (int i=0; i < 4; i++) {
+ for (int j=0; j < 4; j++) {
+ switch(ops[i][j]) {
+ case 0:
+ break;
+ case 2:
+ buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
+ break;
+ case 3:
+ buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
+ break;
+ }
+ }
+ }
+ for (int j=0; j < 4; j++) {
+ if (opInit[j]) {
+ if (key.u.addMask & (1 << j)) {
+ buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
+ }
+ } else {
+ if (key.u.addMask & (1 << j)) {
+ buf = addVORR_32(buf, 8+j, 4+j, 4+j);
+ }
+ }
+ }
+
+ // If we have a dot product, perform the special pack.
+ if (key.u.dot) {
+ ADD_CHUNK(pack_u8_1);
+ ADD_CHUNK(dot);
+ } else {
+ switch(key.u.outVecSize) {
+ case 3:
+ if (key.u.copyAlpha) {
+ ADD_CHUNK(pack_u8_3);
+ } else {
+ ADD_CHUNK(pack_u8_4);
+ }
+ break;
+ case 2:
+ ADD_CHUNK(pack_u8_3);
+ break;
+ case 1:
+ ADD_CHUNK(pack_u8_2);
+ break;
+ case 0:
+ ADD_CHUNK(pack_u8_1);
+ break;
+ }
+ }
+
+ // Write out result
+ switch(key.u.outVecSize) {
+ case 3:
+ case 2:
+ ADD_CHUNK(store_u8_4);
+ break;
+ case 1:
+ ADD_CHUNK(store_u8_2);
+ break;
+ case 0:
+ ADD_CHUNK(store_u8_1);
+ break;
+ }
+ }
+
+ if (key.u.inType != key.u.outType) {
+ key.u.copyAlpha = 0;
+ key.u.dot = 0;
+ }
+
+ // Loop, branch, and cleanup
+ ADD_CHUNK(postfix1);
+ buf = addBranch(buf, buf2, 0x01);
+ ADD_CHUNK(postfix2);
+
+ int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
+ if (ret == -1) {
+ ALOGE("mprotect error %i", ret);
+ return false;
+ }
+
+ __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
+ return true;
+#else
+ (void) key; // Avoid unused parameter warning.
+ return false;
+#endif
+}
+
+void ColorMatrixTask::updateCoeffCache(float fpMul, float addMul) {
+ for(int ct=0; ct < 16; ct++) {
+ mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
+ mTmpFp[ct] = mFp[ct] * fpMul;
+ //ALOGE("mat %i %f %f", ct, mFp[ct], tmpFp[ct]);
+ }
+
+ float add = 0.f;
+ if (fpMul > 254.f) add = 0.5f;
+ for(int ct=0; ct < 4; ct++) {
+ mTmpFpa[ct] = mFpa[ct] * addMul + add;
+ //ALOGE("mFpa %i %f %f", ct, mFpa[ct], tmpFpa[ct * 4 + 0]);
+ }
+
+ for(int ct=0; ct < 4; ct++) {
+ mIpa[ct] = (int)(mFpa[ct] * 65536.f + 0.5f);
+ }
+}
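+
+// For example, with fpMul = 1.f and addMul = 255.f (the u8 -> u8 case used
+// by preLaunch below), a matrix entry of 0.5f becomes mIp = 128 (Q8.8 fixed
+// point) and an add-vector entry of 1.0f becomes mTmpFpa = 255.f and
+// mIpa = 65536 (16.16 fixed point).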
+
+
+
+static void One(void *out,
+ const void *py, const float* coeff, const float *add,
+ uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
+
+ float4 f = 0.f;
+ if (fin) {
+ switch(vsin) {
+ case 3:
+ f = ((const float4 *)py)[0];
+ break;
+ case 2:
+ f = ((const float4 *)py)[0];
+ f.w = 0.f;
+ break;
+ case 1:
+ f.xy = ((const float2 *)py)[0];
+ break;
+ case 0:
+ f.x = ((const float *)py)[0];
+ break;
+ }
+ } else {
+ switch(vsin) {
+ case 3:
+ f = convert<float4>(((const uchar4 *)py)[0]);
+ break;
+ case 2:
+ f = convert<float4>(((const uchar4 *)py)[0]);
+ f.w = 0.f;
+ break;
+ case 1:
+ f.xy = convert<float2>(((const uchar2 *)py)[0]);
+ break;
+ case 0:
+ f.x = (float)(((const uchar *)py)[0]);
+ break;
+ }
+ }
+ //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w);
+
+ float4 sum;
+ sum.x = f.x * coeff[0] +
+ f.y * coeff[4] +
+ f.z * coeff[8] +
+ f.w * coeff[12];
+ sum.y = f.x * coeff[1] +
+ f.y * coeff[5] +
+ f.z * coeff[9] +
+ f.w * coeff[13];
+ sum.z = f.x * coeff[2] +
+ f.y * coeff[6] +
+ f.z * coeff[10] +
+ f.w * coeff[14];
+ sum.w = f.x * coeff[3] +
+ f.y * coeff[7] +
+ f.z * coeff[11] +
+ f.w * coeff[15];
+ //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
+
+ sum.x += add[0];
+ sum.y += add[1];
+ sum.z += add[2];
+ sum.w += add[3];
+
+
+ //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
+ if (fout) {
+ switch(vsout) {
+ case 3:
+ case 2:
+ ((float4 *)out)[0] = sum;
+ break;
+ case 1:
+ ((float2 *)out)[0] = sum.xy;
+ break;
+ case 0:
+ ((float *)out)[0] = sum.x;
+ break;
+ }
+ } else {
+ sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
+ sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
+ sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
+ sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
+
+ switch(vsout) {
+ case 3:
+ case 2:
+ ((uchar4 *)out)[0] = convert<uchar4>(sum);
+ break;
+ case 1:
+ ((uchar2 *)out)[0] = convert<uchar2>(sum.xy);
+ break;
+ case 0:
+ ((uchar *)out)[0] = sum.x;
+ break;
+ }
+ }
+ //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2],
+ // ((float *)out)[3]);
+}
+
+void ColorMatrixTask::kernel(uchar *out, uchar *in, uint32_t xstart, uint32_t xend) {
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+ uint32_t vsin = mLastKey.u.inVecSize;
+ uint32_t vsout = mLastKey.u.outVecSize;
+ bool floatIn = !!mLastKey.u.inType;
+ bool floatOut = !!mLastKey.u.outType;
+
+ //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
+
+ if(x2 > x1) {
+ int32_t len = x2 - x1;
+ if (mUsesSimd) {
+ if((mOptKernel != nullptr) && (len >= 4)) {
+ // The optimized kernel processes 4 pixels at once
+ // and requires a minimum of 1 chunk of 4
+ mOptKernel(out, in, mIp, len >> 2);
+ // Update the len and pointers so the generic code can
+ // finish any leftover pixels
+ len &= ~3;
+ x1 += len;
+ out += mOutstep * len;
+ in += mInstep * len;
+ }
+#if defined(ARCH_ARM64_USE_INTRINSICS)
+ else {
+ if (mLastKey.u.inType == RS_TYPE_FLOAT_32 ||
+ mLastKey.u.outType == RS_TYPE_FLOAT_32) {
+ // Currently this generates off-by-one errors.
+ // rsdIntrinsicColorMatrix_float_K(out, in, len, &mFnTab, tmpFp, tmpFpa);
+ // x1 += len;
+ // out += outstep * len;
+ // in += instep * len;
+ } else {
+ rsdIntrinsicColorMatrix_int_K(out, in, len, &mFnTab, mIp, mIpa);
+ x1 += len;
+ out += mOutstep * len;
+ in += mInstep * len;
+ }
+ }
+#endif
+ }
+
+ while(x1 != x2) {
+ One(out, in, mTmpFp, mTmpFpa, vsin, vsout, floatIn, floatOut);
+ out += mOutstep;
+ in += mInstep;
+ x1++;
+ }
+ }
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+void ColorMatrixTask::preLaunch(size_t inVectorSize, int inType, size_t outVectorSize,
+ int outType) {
+ if (inType == outType) {
+ if (outType == RS_TYPE_UNSIGNED_8) {
+ updateCoeffCache(1.f, 255.f);
+ } else {
+ updateCoeffCache(1.f, 1.f);
+ }
+ } else {
+ if (outType == RS_TYPE_UNSIGNED_8) {
+ updateCoeffCache(255.f, 255.f);
+ } else {
+ updateCoeffCache(1.f / 255.f, 1.f);
+ }
+ }
+
+ Key_t key = computeKey(inVectorSize, inType, outVectorSize, outType);
+#else
+void ColorMatrixTask::preLaunch(size_t inVectorSize, size_t outVectorSize) {
+ updateCoeffCache(1.f, 255.f);
+
+ Key_t key = computeKey(inVectorSize, outVectorSize);
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+#if defined(ARCH_X86_HAVE_SSSE3)
+ if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
+ // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
+ // mOptKernel =
+ // (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key);
+ mLastKey = key;
+ }
+
+#else //if !defined(ARCH_X86_HAVE_SSSE3)
+ if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
+ if (mBuf) munmap(mBuf, mBufSize);
+ mBuf = nullptr;
+ mOptKernel = nullptr;
+ if (build(key)) {
+ mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf;
+ }
+#if defined(ARCH_ARM64_USE_INTRINSICS)
+ else {
+ int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
+ int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
+ uint32_t mm = 0;
+ int i;
+ for (i = 0; i < 4; i++)
+ {
+ uint32_t m = (key.u.coeffMask >> i) & 0x1111;
+ m = ((m * 0x249) >> 9) & 15;
+ m |= ((key.u.addMask >> i) & 1) << 4;
+ mm |= m << (i * 5);
+ }
+
+ if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
+ rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
+ } else {
+ rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
+ }
+ }
+#endif
+ mLastKey = key;
+ }
+#endif //if !defined(ARCH_X86_HAVE_SSSE3)
+}
+
+void ColorMatrixTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ for (size_t y = startY; y < endY; y++) {
+ size_t offset = mSizeX * y + startX;
+ uchar* in = ((uchar*)mIn) + offset * paddedSize(mInputVectorSize);
+ uchar* out = ((uchar*)mOut) + offset * paddedSize(mVectorSize);
+ kernel(out, in, startX, endX);
+ }
+}
+
+static const float fourZeroes[]{0.0f, 0.0f, 0.0f, 0.0f};
+
+void RenderScriptToolkit::colorMatrix(const void* in, void* out, size_t inputVectorSize,
+ size_t outputVectorSize, size_t sizeX, size_t sizeY,
+ const float* matrix, const float* addVector,
+ const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+ return;
+ }
+ if (inputVectorSize < 1 || inputVectorSize > 4) {
+ ALOGE("The inputVectorSize should be between 1 and 4. %zu provided.", inputVectorSize);
+ return;
+ }
+ if (outputVectorSize < 1 || outputVectorSize > 4) {
+ ALOGE("The outputVectorSize should be between 1 and 4. %zu provided.", outputVectorSize);
+ return;
+ }
+#endif
+
+ if (addVector == nullptr) {
+ addVector = fourZeroes;
+ }
+ ColorMatrixTask task(in, out, inputVectorSize, outputVectorSize, sizeX, sizeY, matrix,
+ addVector, restriction);
+ processor->doTask(&task);
+}
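+
+/* Usage sketch (illustrative only; "toolkit", "in", "out", "sizeX" and
+ * "sizeY" are assumed): a BT.601 greyscale conversion of an RGBA_8888
+ * buffer could look like
+ *
+ *   const float grey[16] = {0.299f, 0.299f, 0.299f, 0.f,   // input R
+ *                           0.587f, 0.587f, 0.587f, 0.f,   // input G
+ *                           0.114f, 0.114f, 0.114f, 0.f,   // input B
+ *                           0.f,    0.f,    0.f,    1.f};  // input A
+ *   toolkit.colorMatrix(in, out, 4, 4, sizeX, sizeY, grey, nullptr, nullptr);
+ *
+ * where, as in One() above, output channel r is the dot product of the
+ * input with matrix[r], matrix[4 + r], matrix[8 + r] and matrix[12 + r].
+ */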
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/ColorMatrix_advsimd.S b/toolkit/ColorMatrix_advsimd.S
new file mode 100644
index 0000000..55b0029
--- /dev/null
+++ b/toolkit/ColorMatrix_advsimd.S
@@ -0,0 +1,1277 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
+.macro vmxx_f32 i, mask, opd, opa, opb
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1)
+ fmla \opd, \opa, \opb
+ .else
+ fmul \opd, \opa, \opb
+ .endif
+ .endif
+.endm
+
+.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1)
+ fadd \opd, \opa, \opb
+ .else
+ mov \stupidsyntax1, \stupidsyntax2
+ .endif
+ .endif
+.endm
+
+.macro vmxx_s16 i, mask, opd, opa, opb
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1 + 16)
+ smlal \opd, \opa, \opb
+ .else
+ smull \opd, \opa, \opb
+ .endif
+ .endif
+.endm
+
+.macro vmxx2_s16 i, mask, opd, opa, opb
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1 + 16)
+ smlal2 \opd, \opa, \opb
+ .else
+ smull2 \opd, \opa, \opb
+ .endif
+ .endif
+.endm
+
+/* x0 = dst
+ * x1 = src
+ * x2 = count
+ * x3 = params
+ * x4 = column0_fn
+ * x5 = column1_fn
+ * x6 = column2_fn
+ * x7 = column3_fn
+ * x8 = store_fn
+ * x9 = load_fn
+ */
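+
+/* These function pointers form a threaded loop: the load routine ends with
+ * "br x4" into the column0 function, each column function chains to the
+ * next through x5, x6 and x7, the last column branches to the store
+ * function through x8, and the store routine loops back to the load
+ * routine through x9 until fewer than eight pixels remain.
+ */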
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+.align 6
+colormatrix_int_col0_\i:
+ .if \i & 16
+ dup v6.4s, v4.s[0]
+ dup v7.4s, v4.s[0]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[4]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[0]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[4]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[0]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[4]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[0]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[4]
+ sqshrun v8.4h, v6.4s, #8
+ sqshrun2 v8.8h, v7.4s, #8
+ br x5
+
+colormatrix_int_col0_n\i:
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[0]
+ dup v7.4s, v4.s[0]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[0]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[4]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[0]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[4]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[0]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[4]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[0]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[4]
+ sqshrun v8.4h, v6.4s, #8
+ sqshrun2 v8.8h, v7.4s, #8
+ br x5
+
+.align 6
+colormatrix_int_col1_\i:
+ .if \i & 16
+ dup v6.4s, v4.s[1]
+ dup v7.4s, v4.s[1]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[1]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[5]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[1]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[5]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[1]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[5]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[1]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[5]
+ sqshrun v9.4h, v6.4s, #8
+ sqshrun2 v9.8h, v7.4s, #8
+ br x6
+
+colormatrix_int_col1_n\i:
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[1]
+ dup v7.4s, v4.s[1]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[1]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[5]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[1]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[5]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[1]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[5]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[1]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[5]
+ sqshrun v9.4h, v6.4s, #8
+ sqshrun2 v9.8h, v7.4s, #8
+ br x6
+
+.align 6
+colormatrix_int_col2_\i:
+ .if \i & 16
+ dup v6.4s, v4.s[2]
+ dup v7.4s, v4.s[2]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[2]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[6]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[2]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[6]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[2]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[6]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[2]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[6]
+ sqshrun v10.4h, v6.4s, #8
+ sqshrun2 v10.8h, v7.4s, #8
+ br x7
+
+colormatrix_int_col2_n\i:
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[2]
+ dup v7.4s, v4.s[2]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[2]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[6]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[2]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[6]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[2]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[6]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[2]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[6]
+ sqshrun v10.4h, v6.4s, #8
+ sqshrun2 v10.8h, v7.4s, #8
+ br x7
+
+.align 6
+colormatrix_int_col3_\i:
+ .if \i & 16
+ dup v6.4s, v4.s[3]
+ dup v7.4s, v4.s[3]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[3]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[7]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[3]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[7]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[3]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[7]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[3]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[7]
+ sqshrun v11.4h, v6.4s, #8
+ sqshrun2 v11.8h, v7.4s, #8
+ br x8
+
+colormatrix_int_col3_n\i:
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[3]
+ dup v7.4s, v4.s[3]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[3]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[7]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[3]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[7]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[3]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[7]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[3]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[7]
+ sqshrun v11.4h, v6.4s, #8
+ sqshrun2 v11.8h, v7.4s, #8
+ br x8
+
+.align 5
+colormatrix_float_col0_\i:
+ vmxx_f32 \i, 1, v8.4s, v12.4s, v0.s[0]
+ vmxx_f32 \i, 2, v8.4s, v13.4s, v1.s[0]
+ vmxx_f32 \i, 4, v8.4s, v14.4s, v2.s[0]
+ vmxx_f32 \i, 8, v8.4s, v15.4s, v3.s[0]
+ vadd_f32 \i, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
+ vmxx_f32 \i, 1, v16.4s, v20.4s, v0.s[0]
+ vmxx_f32 \i, 2, v16.4s, v21.4s, v1.s[0]
+ vmxx_f32 \i, 4, v16.4s, v22.4s, v2.s[0]
+ vmxx_f32 \i, 8, v16.4s, v23.4s, v3.s[0]
+ vadd_f32 \i, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
+ br x5
+
+.align 4
+colormatrix_float_col0_n\i:
+ vmxx_f32 \i^31, 1, v8.4s, v12.4s, v0.s[0]
+ vmxx_f32 \i^31, 2, v8.4s, v13.4s, v1.s[0]
+ vmxx_f32 \i^31, 4, v8.4s, v14.4s, v2.s[0]
+ vmxx_f32 \i^31, 8, v8.4s, v15.4s, v3.s[0]
+ vadd_f32 \i^31, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
+ vmxx_f32 \i^31, 1, v16.4s, v20.4s, v0.s[0]
+ vmxx_f32 \i^31, 2, v16.4s, v21.4s, v1.s[0]
+ vmxx_f32 \i^31, 4, v16.4s, v22.4s, v2.s[0]
+ vmxx_f32 \i^31, 8, v16.4s, v23.4s, v3.s[0]
+ vadd_f32 \i^31, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
+ br x5
+
+.align 5
+colormatrix_float_col1_\i:
+ vmxx_f32 \i, 1, v9.4s, v12.4s, v0.s[1]
+ vmxx_f32 \i, 2, v9.4s, v13.4s, v1.s[1]
+ vmxx_f32 \i, 4, v9.4s, v14.4s, v2.s[1]
+ vmxx_f32 \i, 8, v9.4s, v15.4s, v3.s[1]
+ vadd_f32 \i, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
+ vmxx_f32 \i, 1, v17.4s, v20.4s, v0.s[1]
+ vmxx_f32 \i, 2, v17.4s, v21.4s, v1.s[1]
+ vmxx_f32 \i, 4, v17.4s, v22.4s, v2.s[1]
+ vmxx_f32 \i, 8, v17.4s, v23.4s, v3.s[1]
+ vadd_f32 \i, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
+ br x6
+
+.align 4
+colormatrix_float_col1_n\i:
+ vmxx_f32 \i^31, 1, v9.4s, v12.4s, v0.s[1]
+ vmxx_f32 \i^31, 2, v9.4s, v13.4s, v1.s[1]
+ vmxx_f32 \i^31, 4, v9.4s, v14.4s, v2.s[1]
+ vmxx_f32 \i^31, 8, v9.4s, v15.4s, v3.s[1]
+ vadd_f32 \i^31, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
+ vmxx_f32 \i^31, 1, v17.4s, v20.4s, v0.s[1]
+ vmxx_f32 \i^31, 2, v17.4s, v21.4s, v1.s[1]
+ vmxx_f32 \i^31, 4, v17.4s, v22.4s, v2.s[1]
+ vmxx_f32 \i^31, 8, v17.4s, v23.4s, v3.s[1]
+ vadd_f32 \i^31, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
+ br x6
+
+.align 5
+colormatrix_float_col2_\i:
+ vmxx_f32 \i, 1, v10.4s, v12.4s, v0.s[2]
+ vmxx_f32 \i, 2, v10.4s, v13.4s, v1.s[2]
+ vmxx_f32 \i, 4, v10.4s, v14.4s, v2.s[2]
+ vmxx_f32 \i, 8, v10.4s, v15.4s, v3.s[2]
+ vadd_f32 \i, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
+ vmxx_f32 \i, 1, v18.4s, v20.4s, v0.s[2]
+ vmxx_f32 \i, 2, v18.4s, v21.4s, v1.s[2]
+ vmxx_f32 \i, 4, v18.4s, v22.4s, v2.s[2]
+ vmxx_f32 \i, 8, v18.4s, v23.4s, v3.s[2]
+ vadd_f32 \i, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
+ br x7
+
+.align 4
+colormatrix_float_col2_n\i:
+ vmxx_f32 \i^31, 1, v10.4s, v12.4s, v0.s[2]
+ vmxx_f32 \i^31, 2, v10.4s, v13.4s, v1.s[2]
+ vmxx_f32 \i^31, 4, v10.4s, v14.4s, v2.s[2]
+ vmxx_f32 \i^31, 8, v10.4s, v15.4s, v3.s[2]
+ vadd_f32 \i^31, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
+ vmxx_f32 \i^31, 1, v18.4s, v20.4s, v0.s[2]
+ vmxx_f32 \i^31, 2, v18.4s, v21.4s, v1.s[2]
+ vmxx_f32 \i^31, 4, v18.4s, v22.4s, v2.s[2]
+ vmxx_f32 \i^31, 8, v18.4s, v23.4s, v3.s[2]
+ vadd_f32 \i^31, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
+ br x7
+
+.align 5
+colormatrix_float_col3_\i:
+ vmxx_f32 \i, 1, v11.4s, v12.4s, v0.s[3]
+ vmxx_f32 \i, 2, v11.4s, v13.4s, v1.s[3]
+ vmxx_f32 \i, 4, v11.4s, v14.4s, v2.s[3]
+ vmxx_f32 \i, 8, v11.4s, v15.4s, v3.s[3]
+ vadd_f32 \i, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
+ vmxx_f32 \i, 1, v19.4s, v20.4s, v0.s[3]
+ vmxx_f32 \i, 2, v19.4s, v21.4s, v1.s[3]
+ vmxx_f32 \i, 4, v19.4s, v22.4s, v2.s[3]
+ vmxx_f32 \i, 8, v19.4s, v23.4s, v3.s[3]
+ vadd_f32 \i, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
+ br x8
+
+.align 4
+colormatrix_float_col3_n\i:
+ vmxx_f32 \i^31, 1, v11.4s, v12.4s, v0.s[3]
+ vmxx_f32 \i^31, 2, v11.4s, v13.4s, v1.s[3]
+ vmxx_f32 \i^31, 4, v11.4s, v14.4s, v2.s[3]
+ vmxx_f32 \i^31, 8, v11.4s, v15.4s, v3.s[3]
+ vadd_f32 \i^31, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
+ vmxx_f32 \i^31, 1, v19.4s, v20.4s, v0.s[3]
+ vmxx_f32 \i^31, 2, v19.4s, v21.4s, v1.s[3]
+ vmxx_f32 \i^31, 4, v19.4s, v22.4s, v2.s[3]
+ vmxx_f32 \i^31, 8, v19.4s, v23.4s, v3.s[3]
+ vadd_f32 \i^31, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
+ br x8
+
+.endr
+
+.align 6
+colormatrix_float_ldu4:
+ ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v23.8h, v23.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl v15.4s, v23.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ uxtl2 v23.4s, v23.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v15.4s, v15.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ ucvtf v23.4s, v23.4s
+ br x4
+
+.align 5
+colormatrix_int_ldu4:
+ ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ uxtl v15.8h, v15.8b
+ br x4
+
+.align 6
+colormatrix_float_ldu3:
+ ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ br x4
+
+colormatrix_int_ldu3:
+ ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ br x4
+
+.align 5
+colormatrix_float_ldu1:
+ ld1 {v20.8b}, [x1], #8
+ uxtl v20.8h, v20.8b
+ uxtl v12.4s, v20.4h
+ uxtl2 v20.4s, v20.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v20.4s, v20.4s
+ br x4
+
+.align 6
+colormatrix_float_ldu2:
+ ld2 {v20.8b,v21.8b}, [x1], #16
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ br x4
+
+.align 4
+colormatrix_int_ldu2:
+ ld2 {v12.8b,v13.8b}, [x1], #16
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ br x4
+
+.align 6
+colormatrix_float_stu4:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v27.4s, v11.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ fcvtzs v31.4s, v19.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun v27.4h, v27.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ sqrshrun2 v27.8h, v31.4s, #1
+ uqxtn v24.8b, v24.8h
+ uqxtn v25.8b, v25.8h
+ uqxtn v26.8b, v26.8h
+ uqxtn v27.8b, v27.8h
+ subs x2, x2, #8
+ st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+ blo colormatrix_float_end
+ br x9
+
+.align 5
+colormatrix_int_stu4:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ uqxtn v15.8b, v11.8h
+ subs x2, x2, #8
+ st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+ blo colormatrix_int_end
+ br x9
+
+.align 6
+colormatrix_float_stu3:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ uqxtn v24.8b, v24.8h
+ uqxtn v25.8b, v25.8h
+ uqxtn v26.8b, v26.8h
+ movi v27.8b, #0
+ subs x2, x2, #8
+ st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+ blo colormatrix_float_end
+ br x9
+
+.align 4
+colormatrix_int_ldu1:
+ ld1 {v12.8b}, [x1], #8
+ uxtl v12.8h, v12.8b
+ br x4
+
+.align 5
+colormatrix_int_stu3:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ movi v15.8b, #0
+ subs x2, x2, #8
+ st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+ blo colormatrix_int_end
+ br x9
+
+.align 6
+colormatrix_float_stu2:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ uqxtn v24.8b, v24.8h
+ uqxtn v25.8b, v25.8h
+ subs x2, x2, #8
+ st2 {v24.8b,v25.8b}, [x0], #16
+ blo colormatrix_float_end
+ br x9
+
+.align 5
+colormatrix_int_stu2:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ subs x2, x2, #8
+ st2 {v12.8b,v13.8b}, [x0], #16
+ blo colormatrix_int_end
+ br x9
+
+.align 5
+colormatrix_int_stu1:
+ uqxtn v12.8b, v8.8h
+ subs x2, x2, #8
+ st1 {v12.8b}, [x0], #8
+ blo colormatrix_int_end
+ br x9
+
+colormatrix_float_ldf3:
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+ ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+ br x4
+
+.align 6
+colormatrix_float_stu1:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ uqxtn v24.8b, v24.8h
+ subs x2, x2, #8
+ st1 {v24.8b}, [x0], #8
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_stf3:
+ movi v11.16b, #0
+ st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+ movi v19.16b, #0
+ subs x2, x2, #8
+ st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ blo colormatrix_float_end
+ br x9
+
+.align 5
+colormatrix_float_stf4:
+ st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+ subs x2, x2, #8
+ st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_ldf4:
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+ ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+ br x4
+
+.align 5
+colormatrix_float_stf2:
+ st2 {v8.4s, v9.4s}, [x0], #32
+ subs x2, x2, #8
+ st2 {v16.4s, v17.4s}, [x0], #32
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_ldf2:
+ ld2 {v12.4s,v13.4s}, [x1], #32
+ ld2 {v20.4s,v21.4s}, [x1], #32
+ br x4
+
+.align 5
+colormatrix_float_stf1:
+ st1 {v8.4s}, [x0], #16
+ subs x2, x2, #8
+ st1 {v16.4s}, [x0], #16
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_ldf1:
+ ld1 {v12.4s}, [x1], #16
+ ld1 {v20.4s}, [x1], #16
+ br x4
+
+colormatrix_int_stu1_end:
+ uqxtn v12.8b, v8.8h
+ tbz x2, #2, 1f
+ st1 {v12.s}[1], [x0], #4
+1: tbz x2, #1, 1f
+ st1 {v12.h}[1], [x0], #2
+1: tbz x2, #0, 1f
+ st1 {v12.b}[1], [x0], #1
+1: b colormatrix_int_realend
+
+colormatrix_int_stu2_end:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ zip1 v12.16b, v12.16b, v13.16b
+ tbz x2, #2, 1f
+ st1 {v12.d}[1], [x0], #8
+1: tbz x2, #1, 1f
+ st1 {v12.s}[1], [x0], #4
+1: tbz x2, #0, 1f
+ st1 {v12.h}[1], [x0], #2
+1: b colormatrix_int_realend
+
+colormatrix_int_stu3_end:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ movi v15.8b, #0
+ tbz x2, #2, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1: tbz x2, #1, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1: tbz x2, #0, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1: b colormatrix_int_realend
+
+colormatrix_int_stu4_end:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ uqxtn v15.8b, v11.8h
+ tbz x2, #2, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1: tbz x2, #1, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1: tbz x2, #0, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1: b colormatrix_int_realend
+
+
+colormatrix_int_ldu1_end:
+ tbz x2, #2, 1f
+ ld1 {v15.s}[3], [x1], #4
+1: tbz x2, #1, 1f
+ ld1 {v15.h}[5], [x1], #2
+1: tbz x2, #0, 1f
+ ld1 {v15.b}[9], [x1], #1
+1: uxtl2 v12.8h, v15.16b
+ br x4
+
+colormatrix_int_ldu2_end:
+ tbz x2, #2, 1f
+ ld1 {v15.d}[1], [x1], #8
+1: tbz x2, #1, 1f
+ ld1 {v15.s}[1], [x1], #4
+1: tbz x2, #0, 1f
+ ld1 {v15.h}[1], [x1], #2
+1: uzp1 v14.16b, v15.16b, v15.16b
+ uzp2 v15.16b, v15.16b, v15.16b
+ uxtl v12.8h, v14.8b
+ uxtl v13.8h, v15.8b
+ br x4
+
+colormatrix_int_ldu3_end:
+ tbz x2, #2, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
+1: tbz x2, #1, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
+1: tbz x2, #0, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
+1: uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ br x4
+
+colormatrix_int_ldu4_end:
+ tbz x2, #2, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
+1: tbz x2, #1, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
+1: tbz x2, #0, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
+1: uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ uxtl v15.8h, v15.8b
+ br x4
+
+colormatrix_float_stu1_end:
+ fcvtzs v12.4s, v8.4s, #1
+ fcvtzs v13.4s, v16.4s, #1
+ sqrshrun v12.4h, v12.4s, #1
+ sqrshrun2 v12.8h, v13.4s, #1
+ uqxtn v12.8b, v12.8h
+ tbz x2, #2, 1f
+ st1 {v12.s}[1], [x0], #4
+1: tbz x2, #1, 1f
+ st1 {v12.h}[1], [x0], #2
+1: tbz x2, #0, 1f
+ st1 {v12.b}[1], [x0], #1
+1: b colormatrix_float_realend
+
+colormatrix_float_stu2_end:
+ fcvtzs v12.4s, v8.4s, #1
+ fcvtzs v13.4s, v9.4s, #1
+ fcvtzs v14.4s, v16.4s, #1
+ fcvtzs v15.4s, v17.4s, #1
+ sqrshrun v12.4h, v12.4s, #1
+ sqrshrun v13.4h, v13.4s, #1
+ sqrshrun v14.4h, v14.4s, #1
+ sqrshrun v15.4h, v15.4s, #1
+ zip1 v12.8h, v12.8h, v13.8h
+ zip1 v13.8h, v14.8h, v15.8h
+ uqxtn v12.8b, v12.8h
+ uqxtn2 v12.16b, v13.8h
+ tbz x2, #2, 1f
+ st1 {v12.d}[1], [x0], #8
+1: tbz x2, #1, 1f
+ st1 {v12.s}[1], [x0], #4
+1: tbz x2, #0, 1f
+ st1 {v12.h}[1], [x0], #2
+1: b colormatrix_float_realend
+
+colormatrix_float_stu3_end:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ uqxtn v12.8b, v24.8h
+ uqxtn v13.8b, v25.8h
+ uqxtn v14.8b, v26.8h
+ movi v15.8b, #0
+ tbz x2, #2, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1: tbz x2, #1, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1: tbz x2, #0, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1: b colormatrix_float_realend
+
+colormatrix_float_stu4_end:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v27.4s, v11.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ fcvtzs v31.4s, v19.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun v27.4h, v27.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ sqrshrun2 v27.8h, v31.4s, #1
+ uqxtn v12.8b, v24.8h
+ uqxtn v13.8b, v25.8h
+ uqxtn v14.8b, v26.8h
+ uqxtn v15.8b, v27.8h
+ tbz x2, #2, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1: tbz x2, #1, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1: tbz x2, #0, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1: b colormatrix_float_realend
+
+colormatrix_float_stf1_end:
+ tbz x2, #2, 1f
+ st1 {v16.4s}, [x0], #16
+1: tbz x2, #1, 1f
+ st1 {v8.d}[1], [x0], #8
+1: tbz x2, #0, 1f
+ st1 {v8.s}[1], [x0], #4
+1: b colormatrix_float_realend
+
+colormatrix_float_stf2_end:
+ tbz x2, #2, 1f
+ st2 {v16.4s, v17.4s}, [x0], #32
+1: tbz x2, #1, 1f
+ st2 {v8.s,v9.s}[2], [x0], #8
+ st2 {v8.s,v9.s}[3], [x0], #8
+1: tbz x2, #0, 1f
+ st2 {v8.s,v9.s}[1], [x0], #8
+1: b colormatrix_float_realend
+
+colormatrix_float_stf3_end:
+ movi v11.16b, #0
+ movi v19.16b, #0
+colormatrix_float_stf4_end:
+ tbz x2, #2, 1f
+ st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+1: tbz x2, #1, 1f
+ st4 {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16
+ st4 {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
+1: tbz x2, #0, 1f
+ st4 {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16
+1: b colormatrix_float_realend
+
+colormatrix_float_ldu1_end:
+ tbz x2, #2, 1f
+ ld1 {v15.s}[1], [x1], #4
+1: tbz x2, #1, 1f
+ ld1 {v15.h}[1], [x1], #2
+1: tbz x2, #0, 1f
+ ld1 {v15.b}[1], [x1], #1
+1: uxtl v15.8h, v15.8b
+ uxtl v12.4s, v15.4h
+ uxtl2 v20.4s, v15.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v20.4s, v20.4s
+ br x4
+
+colormatrix_float_ldu2_end:
+ tbz x2, #2, 1f
+ ld1 {v15.d}[1], [x1], #8
+1: tbz x2, #1, 1f
+ ld1 {v15.s}[1], [x1], #4
+1: tbz x2, #0, 1f
+ ld1 {v15.h}[1], [x1], #2
+1: uxtl v14.8h, v15.8b
+ uxtl2 v15.8h, v15.16b
+ uzp1 v12.8h, v14.8h, v14.8h
+ uzp2 v13.8h, v14.8h, v14.8h
+ uzp1 v20.8h, v15.8h, v15.8h
+ uzp2 v21.8h, v15.8h, v15.8h
+ uxtl v12.4s, v12.4h
+ uxtl v13.4s, v13.4h
+ uxtl v20.4s, v20.4h
+ uxtl v21.4s, v21.4h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ br x4
+
+colormatrix_float_ldu3_end:
+ tbz x2, #2, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
+1: tbz x2, #1, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
+1: tbz x2, #0, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
+1: uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ br x4
+
+colormatrix_float_ldu4_end:
+ tbz x2, #2, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
+1: tbz x2, #1, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
+1: tbz x2, #0, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
+1: uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v23.8h, v23.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl v15.4s, v23.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ uxtl2 v23.4s, v23.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v15.4s, v15.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ ucvtf v23.4s, v23.4s
+ br x4
+
+colormatrix_float_ldf1_end:
+ tbz x2, #2, 1f
+ ld1 {v20.4s}, [x1], #16
+1: tbz x2, #1, 1f
+ ld1 {v12.d}[1], [x1], #8
+1: tbz x2, #0, 1f
+ ld1 {v12.s}[1], [x1], #4
+1: br x4
+
+colormatrix_float_ldf2_end:
+ tbz x2, #2, 1f
+ ld2 {v20.4s,v21.4s}, [x1], #32
+1: tbz x2, #1, 1f
+ ld2 {v12.s,v13.s}[2], [x1], #8
+ ld2 {v12.s,v13.s}[3], [x1], #8
+1: tbz x2, #0, 1f
+ ld2 {v12.s,v13.s}[1], [x1], #8
+1: br x4
+
+colormatrix_float_ldf3_end:
+colormatrix_float_ldf4_end:
+ tbz x2, #2, 1f
+ ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+1: tbz x2, #1, 1f
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
+1: tbz x2, #0, 1f
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16
+1: br x4
+
+/* void rsdIntrinsicColorMatrix_int_K(
+ * void *out, // x0
+ * void const *in, // x1
+ * size_t count, // x2
+ * fntab_t const *fns, // x3
+ * int16_t const *mult, // x4
+ * int32_t const *add); // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_int_K)
+ sub x7, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d-v11.1d}, [sp]
+ st1 {v12.1d-v15.1d}, [x7]
+
+ ld1 {v0.8h,v1.8h}, [x4], #32
+ ld1 {v4.4s}, [x5], #16
+
+ ldp x4,x5, [x3],#16
+ ldp x6,x7, [x3],#16
+ ldp x8,x9, [x3],#16
+
+ dup v12.4s, v4.s[0]
+ dup v13.4s, v4.s[1]
+ dup v14.4s, v4.s[2]
+ dup v15.4s, v4.s[3]
+ sqshrun v8.4h, v12.4s, #8
+ sqshrun2 v8.8h, v12.4s, #8
+ sqshrun v9.4h, v13.4s, #8
+ sqshrun2 v9.8h, v13.4s, #8
+ sqshrun v10.4h, v14.4s, #8
+ sqshrun2 v10.8h, v14.4s, #8
+ sqshrun v11.4h, v15.4s, #8
+ sqshrun2 v11.8h, v15.4s, #8
+
+ subs x2, x2, #8
+ blo colormatrix_int_end
+ br x9
+
+colormatrix_int_end:
+ adds x2, x2, #8
+ bls colormatrix_int_realend
+ mov x16, x8
+ ldp x8, x9, [x3], #16
+ cmp x4, x16
+ csel x4, x8, x4, eq
+ cmp x5, x16
+ csel x5, x8, x5, eq
+ cmp x6, x16
+ csel x6, x8, x6, eq
+ cmp x7, x16
+ csel x7, x8, x7, eq
+ br x9
+
+colormatrix_int_realend:
+ ld1 {v8.1d-v11.1d}, [sp], #32
+ ld1 {v12.1d-v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicColorMatrix_int_K)
+
+/* void rsdIntrinsicColorMatrixSetup_int_K(
+ * fntab_t const *fns, // x0
+ * uint32_t mask, // x1
+ * int dt, // x2
+ * int st); // x3
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
+ adrp x7, 2f
+ add x7, x7, :lo12:2f
+ add x4, x7, x2, LSL #3
+ ldrsw x2, [x4], #4
+ ldrsw x4, [x4]
+ add x2, x2, x7
+ add x4, x4, x7
+ adrp x7, 3f
+ add x7, x7, :lo12:3f
+ add x5, x7, x3, LSL #3
+ ldrsw x3, [x5], #4
+ ldrsw x5, [x5]
+ add x3, x3, x7
+ add x5, x5, x7
+ stp x2, x3, [x0, #32]
+ stp x4, x5, [x0, #48]
+
+/* For each column function, if the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
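+
+/* The mask is packed by ColorMatrixTask::preLaunch(): five bits per output
+ * column, the low four flagging which input channels contribute and bit
+ * four flagging a non-zero add-vector entry, which is why the loop below
+ * consumes x1 five bits at a time (and #31 ... lsr #5).
+ */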
+
+ mov x3, #4
+ adrp x7, 4f
+ add x7, x7, :lo12:4f
+1: ands x2, x1, #15
+ beq 9f
+ and x2, x1, #31
+ lsl x2, x2, #4
+ ldrsw x2, [x7, x2]
+ add x2, x2, x7
+9: str x2, [x0], #8
+ lsr x1, x1, #5
+ add x7, x7, #4
+ subs x3, x3, #1
+ bne 1b
+
+/* For every NULL entry, copy the non-NULL entry that follows it, or the store
+ * function. */
+
+ ldr x2, [x0]
+ mov x3, #4
+1: ldr x1, [x0, #-8]!
+ cmp x1, #0
+ csel x2, x1, x2, ne
+ str x2, [x0]
+ subs x3, x3, #1
+ bne 1b
+ ret
+
+END(rsdIntrinsicColorMatrixSetup_int_K)
+.rodata
+ .align 4
+2: .word colormatrix_int_stu1-2b
+ .word colormatrix_int_stu1_end-2b
+ .word colormatrix_int_stu2-2b
+ .word colormatrix_int_stu2_end-2b
+ .word colormatrix_int_stu3-2b
+ .word colormatrix_int_stu3_end-2b
+ .word colormatrix_int_stu4-2b
+ .word colormatrix_int_stu4_end-2b
+3: .word colormatrix_int_ldu1-3b
+ .word colormatrix_int_ldu1_end-3b
+ .word colormatrix_int_ldu2-3b
+ .word colormatrix_int_ldu2_end-3b
+ .word colormatrix_int_ldu3-3b
+ .word colormatrix_int_ldu3_end-3b
+ .word colormatrix_int_ldu4-3b
+ .word colormatrix_int_ldu4_end-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .word colormatrix_int_col0_\i-4b
+ .word colormatrix_int_col1_\i-4b-4
+ .word colormatrix_int_col2_\i-4b-8
+ .word colormatrix_int_col3_\i-4b-12
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ .word colormatrix_int_col0_n\i-4b
+ .word colormatrix_int_col1_n\i-4b-4
+ .word colormatrix_int_col2_n\i-4b-8
+ .word colormatrix_int_col3_n\i-4b-12
+.endr
+
+
+/* void rsdIntrinsicColorMatrix_float_K(
+ * void *out, // x0
+ * void const *in, // x1
+ * size_t count, // x2
+ * fntab_t const *fns, // x3
+ * float const *mult, // x4
+ * float const *add); // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_float_K)
+ sub x7, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d-v11.1d}, [sp]
+ st1 {v12.1d-v15.1d}, [x7]
+
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64
+ ld1r {v4.4s}, [x5], #4
+ ld1r {v5.4s}, [x5], #4
+ ld1r {v6.4s}, [x5], #4
+ ld1r {v7.4s}, [x5], #4
+
+ ldp x4,x5, [x3], #16
+ ldp x6,x7, [x3], #16
+ ldp x8,x9, [x3], #16
+
+ mov v8.16b, v4.16b
+ mov v9.16b, v5.16b
+ mov v10.16b, v6.16b
+ mov v11.16b, v7.16b
+
+ mov v16.16b, v4.16b
+ mov v17.16b, v5.16b
+ mov v18.16b, v6.16b
+ mov v19.16b, v7.16b
+
+ subs x2, x2, #8
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_end:
+ adds x2, x2, #8
+ bls colormatrix_int_realend
+ mov x16, x8
+ ldp x8,x9, [x3], #16
+ cmp x4, x16
+ csel x4, x8, x4, eq
+ cmp x5, x16
+ csel x5, x8, x5, eq
+ cmp x6, x16
+ csel x6, x8, x6, eq
+ cmp x7, x16
+ csel x7, x8, x7, eq
+ br x9
+
+colormatrix_float_realend:
+ ld1 {v8.1d-v11.1d}, [sp], #32
+ ld1 {v12.1d-v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicColorMatrix_float_K)
+
+/* void rsdIntrinsicColorMatrixSetup_float_K(
+ * fntab_t const *fns, // x0
+ * uint32_t mask, // x1
+ * int dt, // x2
+ * int st); // x3
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
+ adrp x7, 2f
+ add x7, x7, :lo12:2f
+ add x4, x7, x2, LSL #3
+ ldrsw x2, [x4], #4
+ ldrsw x4, [x4]
+ add x2, x2, x7
+ add x4, x4, x7
+ adrp x7, 3f
+ add x7, x7, :lo12:3f
+ add x5, x7, x3, LSL #3
+ ldrsw x3, [x5], #4
+ ldrsw x5, [x5]
+ add x3, x3, x7
+ add x5, x5, x7
+ stp x2, x3, [x0, #32]
+ stp x4, x5, [x0, #48]
+
+/* For each column function, if that column of the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
+
+ mov x3, #4
+ adrp x7, 4f
+ add x7, x7, :lo12:4f
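+/* Each column consumes five bits of the mask: if the low four bits are zero the column is
+ * unused and NULL is stored, otherwise the five-bit value indexes the table at 4f (offset
+ * by one word per column) to pick the column function. */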
+1: ands x2, x1, #15
+ beq 9f
+ and x2, x1, #31
+ lsl x2, x2, #4
+ ldrsw x2, [x7, x2]
+ add x2, x2, x7
+9: str x2, [x0], #8
+ lsr x1, x1, #5
+ add x7, x7, #4
+ subs x3, x3, #1
+ bne 1b
+
+/* For every NULL entry, copy the first non-NULL entry that follows it, falling back to
+ * the store function if none does. */
+
+ ldr x2, [x0]
+ mov x3, #4
+1: ldr x1, [x0, #-8]!
+ cmp x1, #0
+ csel x2, x1, x2, ne
+ str x2, [x0]
+ subs x3, x3, #1
+ bne 1b
+ ret
+
+END(rsdIntrinsicColorMatrixSetup_float_K)
+.rodata
+ .align 4
+2: .word colormatrix_float_stu1-2b
+ .word colormatrix_float_stu1_end-2b
+ .word colormatrix_float_stu2-2b
+ .word colormatrix_float_stu2_end-2b
+ .word colormatrix_float_stu3-2b
+ .word colormatrix_float_stu3_end-2b
+ .word colormatrix_float_stu4-2b
+ .word colormatrix_float_stu4_end-2b
+ .word colormatrix_float_stf1-2b
+ .word colormatrix_float_stf1_end-2b
+ .word colormatrix_float_stf2-2b
+ .word colormatrix_float_stf2_end-2b
+ .word colormatrix_float_stf3-2b
+ .word colormatrix_float_stf3_end-2b
+ .word colormatrix_float_stf4-2b
+ .word colormatrix_float_stf4_end-2b
+3: .word colormatrix_float_ldu1-3b
+ .word colormatrix_float_ldu1_end-3b
+ .word colormatrix_float_ldu2-3b
+ .word colormatrix_float_ldu2_end-3b
+ .word colormatrix_float_ldu3-3b
+ .word colormatrix_float_ldu3_end-3b
+ .word colormatrix_float_ldu4-3b
+ .word colormatrix_float_ldu4_end-3b
+ .word colormatrix_float_ldf1-3b
+ .word colormatrix_float_ldf1_end-3b
+ .word colormatrix_float_ldf2-3b
+ .word colormatrix_float_ldf2_end-3b
+ .word colormatrix_float_ldf3-3b
+ .word colormatrix_float_ldf3_end-3b
+ .word colormatrix_float_ldf4-3b
+ .word colormatrix_float_ldf4_end-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .word colormatrix_float_col0_\i-4b
+ .word colormatrix_float_col1_\i-4b-4
+ .word colormatrix_float_col2_\i-4b-8
+ .word colormatrix_float_col3_\i-4b-12
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ .word colormatrix_float_col0_n\i-4b
+ .word colormatrix_float_col1_n\i-4b-4
+ .word colormatrix_float_col2_n\i-4b-8
+ .word colormatrix_float_col3_n\i-4b-12
+.endr
diff --git a/toolkit/ColorMatrix_neon.S b/toolkit/ColorMatrix_neon.S
new file mode 100644
index 0000000..ecb8c13
--- /dev/null
+++ b/toolkit/ColorMatrix_neon.S
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define SNIP_START(x) \
+ .globl x; x:
+
+#define SNIP_END(x) \
+ .globl x##_end; x##_end: \
+ .globl x##_len; x##_len: \
+ .word x##_end-x
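+
+/* Each SNIP_START/SNIP_END pair delimits a code fragment and emits <name>_end and
+ * <name>_len symbols, so the fragments can be measured and copied when the ColorMatrix
+ * kernel is assembled at run time. */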
+
+SNIP_START(_N_ColorMatrix_prefix_i)
+ stmfd sp!, {r4, lr}
+ vpush {q4-q7}
+ vld1.16 {q2}, [r2]!
+ vld1.16 {q3}, [r2]!
+ vld1.32 {d8[],d9[]}, [r2]!
+ vld1.32 {d10[],d11[]}, [r2]!
+ vld1.32 {d12[],d13[]}, [r2]!
+ vld1.32 {d14[],d15[]}, [r2]!
+ veor q0, q0
+ veor q1, q1
+ veor q9, q9
+ veor q10, q10
+ veor q11, q11
+SNIP_END(_N_ColorMatrix_prefix_i)
+
+SNIP_START(_N_ColorMatrix_prefix_f)
+ stmfd sp!, {r4, lr}
+ vpush {q4-q7}
+ add r2, #48
+ vld1.32 {q4}, [r2]!
+ vld1.32 {q5}, [r2]!
+ vld1.32 {q6}, [r2]!
+ vld1.32 {q7}, [r2]!
+ vld1.32 {d16[],d17[]}, [r2]!
+ vld1.32 {d18[],d19[]}, [r2]!
+ vld1.32 {d20[],d21[]}, [r2]!
+ vld1.32 {d22[],d23[]}, [r2]!
+ veor q1, q1
+ veor q2, q2
+ veor q3, q3
+SNIP_END(_N_ColorMatrix_prefix_f)
+
+SNIP_START(_N_ColorMatrix_postfix1)
+ subs r3, r3, #1
+ #bne 1b
+SNIP_END(_N_ColorMatrix_postfix1)
+
+SNIP_START(_N_ColorMatrix_postfix2)
+
+ #mov r0, #0
+ #ldr r0, [r0]
+
+ #vqadd.s32 q0,q0,q0
+ #vadd.f32 q0,q0,q0
+ #vmul.f32 q0,q0,d0[0]
+ #vmla.f32 q0,q0,d0[0]
+ #vmov q0, q0
+
+
+ vpop {q4-q7}
+ ldmfd sp!, {r4, lr}
+ bx lr
+SNIP_END(_N_ColorMatrix_postfix2)
+
+SNIP_START(_N_ColorMatrix_load_u8_4)
+ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+ vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+ vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+ vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+SNIP_END(_N_ColorMatrix_load_u8_4)
+
+SNIP_START(_N_ColorMatrix_load_u8_3)
+ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+ vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+ vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+ vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+ veor d3, d3
+SNIP_END(_N_ColorMatrix_load_u8_3)
+
+SNIP_START(_N_ColorMatrix_load_u8_2)
+ vld2.8 {d0[0],d1[0]}, [r1]!
+ vld2.8 {d0[1],d1[1]}, [r1]!
+ vld2.8 {d0[2],d1[2]}, [r1]!
+ vld2.8 {d0[3],d1[3]}, [r1]!
+ veor d2, d2
+ veor d3, d3
+SNIP_END(_N_ColorMatrix_load_u8_2)
+
+SNIP_START(_N_ColorMatrix_load_u8_1)
+ vld1.32 {d0[0]}, [r1]!
+ veor d1, d1
+ veor d2, d2
+ veor d3, d3
+SNIP_END(_N_ColorMatrix_load_u8_1)
+
+SNIP_START(_N_ColorMatrix_load_u8f_4)
+ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+ vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+ vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+ vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+ vmovl.u8 q3, d3
+ vmovl.u8 q2, d2
+ vmovl.u8 q1, d1
+ vmovl.u8 q0, d0
+ vmovl.u16 q3, d6
+ vmovl.u16 q2, d4
+ vmovl.u16 q1, d2
+ vmovl.u16 q0, d0
+ vcvt.f32.s32 q3, q3
+ vcvt.f32.s32 q2, q2
+ vcvt.f32.s32 q1, q1
+ vcvt.f32.s32 q0, q0
+SNIP_END(_N_ColorMatrix_load_u8f_4)
+
+SNIP_START(_N_ColorMatrix_load_u8f_3)
+ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+ vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+ vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+ vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+ vmovl.u8 q2, d2
+ vmovl.u8 q1, d1
+ vmovl.u8 q0, d0
+ vmovl.u16 q2, d4
+ vmovl.u16 q1, d2
+ vmovl.u16 q0, d0
+ vcvt.f32.s32 q2, q2
+ vcvt.f32.s32 q1, q1
+ vcvt.f32.s32 q0, q0
+ veor q3, q3
+SNIP_END(_N_ColorMatrix_load_u8f_3)
+
+SNIP_START(_N_ColorMatrix_load_u8f_2)
+ vld2.8 {d0[0],d1[0]}, [r1]!
+ vld2.8 {d0[1],d1[1]}, [r1]!
+ vld2.8 {d0[2],d1[2]}, [r1]!
+ vld2.8 {d0[3],d1[3]}, [r1]!
+ vmovl.u8 q1, d1
+ vmovl.u8 q0, d0
+ vmovl.u16 q1, d2
+ vmovl.u16 q0, d0
+ vcvt.f32.s32 q1, q1
+ vcvt.f32.s32 q0, q0
+ veor q2, q2
+ veor q3, q3
+SNIP_END(_N_ColorMatrix_load_u8f_2)
+
+SNIP_START(_N_ColorMatrix_load_u8f_1)
+ vld1.32 {d0[0]}, [r1]!
+ vmovl.u8 q0, d0
+ vmovl.u16 q0, d0
+ vcvt.f32.s32 q0, q0
+ veor q1, q1
+ veor q2, q2
+ veor q3, q3
+SNIP_END(_N_ColorMatrix_load_u8f_1)
+
+SNIP_START(_N_ColorMatrix_load_f32_4)
+ vld4.32 {d0[0],d2[0],d4[0],d6[0]}, [r1]!
+ vld4.32 {d0[1],d2[1],d4[1],d6[1]}, [r1]!
+ vld4.32 {d1[0],d3[0],d5[0],d7[0]}, [r1]!
+ vld4.32 {d1[1],d3[1],d5[1],d7[1]}, [r1]!
+SNIP_END(_N_ColorMatrix_load_f32_4)
+
+SNIP_START(_N_ColorMatrix_load_f32_3)
+ vld3.32 {d0[0],d2[0],d4[0]}, [r1]!
+ add r1, r1, #4
+ vld3.32 {d0[1],d2[1],d4[1]}, [r1]!
+ add r1, r1, #4
+ vld3.32 {d1[0],d3[0],d5[0]}, [r1]!
+ add r1, r1, #4
+ vld3.32 {d1[1],d3[1],d5[1]}, [r1]!
+ add r1, r1, #4
+ veor q3, q3
+SNIP_END(_N_ColorMatrix_load_f32_3)
+
+SNIP_START(_N_ColorMatrix_load_f32_2)
+ vld2.32 {d0[0],d2[0]}, [r1]!
+ vld2.32 {d0[1],d2[1]}, [r1]!
+ vld2.32 {d1[0],d3[0]}, [r1]!
+ vld2.32 {d1[1],d3[1]}, [r1]!
+ veor q2, q2
+ veor q3, q3
+SNIP_END(_N_ColorMatrix_load_f32_2)
+
+SNIP_START(_N_ColorMatrix_load_f32_1)
+ vld1.32 {q0}, [r1]!
+ veor q1, q1
+ veor q2, q2
+ veor q3, q3
+SNIP_END(_N_ColorMatrix_load_f32_1)
+
+
+SNIP_START(_N_ColorMatrix_store_u8_4)
+#mov r0, #0
+ vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+ vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+ vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+ vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_4)
+
+SNIP_START(_N_ColorMatrix_store_u8_2)
+ vst2.8 {d0[0],d1[0]}, [r0]!
+ vst2.8 {d0[1],d1[1]}, [r0]!
+ vst2.8 {d0[2],d1[2]}, [r0]!
+ vst2.8 {d0[3],d1[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_2)
+
+SNIP_START(_N_ColorMatrix_store_u8_1)
+ vst1.32 {d0[0]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_1)
+
+
+SNIP_START(_N_ColorMatrix_store_f32u_4)
+ vcvt.s32.f32 q0, q0
+ vcvt.s32.f32 q1, q1
+ vcvt.s32.f32 q2, q2
+ vcvt.s32.f32 q3, q3
+ vqmovn.s32 d0, q0
+ vqmovn.s32 d2, q1
+ vqmovn.s32 d4, q2
+ vqmovn.s32 d6, q3
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+ vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+ vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+ vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+
+ #mov r0, #0
+ #ldr r0, [r0]
+
+SNIP_END(_N_ColorMatrix_store_f32u_4)
+
+SNIP_START(_N_ColorMatrix_store_f32u_2)
+ vcvt.s32.f32 q0, q0
+ vcvt.s32.f32 q1, q1
+ vqmovn.s32 d0, q0
+ vqmovn.s32 d2, q1
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst2.8 {d0[0],d1[0]}, [r0]!
+ vst2.8 {d0[1],d1[1]}, [r0]!
+ vst2.8 {d0[2],d1[2]}, [r0]!
+ vst2.8 {d0[3],d1[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_f32u_2)
+
+SNIP_START(_N_ColorMatrix_store_f32u_1)
+ vcvt.s32.f32 q0, q0
+ vqmovn.s32 d0, q0
+ vqmovun.s16 d0, q0
+ vst1.32 {d0[0]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_f32u_1)
+
+SNIP_START(_N_ColorMatrix_store_f32_4)
+ vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]!
+ vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]!
+ vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]!
+ vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_f32_4)
+
+SNIP_START(_N_ColorMatrix_store_f32_3)
+ vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]!
+ vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]!
+ vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]!
+ vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_f32_3)
+
+SNIP_START(_N_ColorMatrix_store_f32_2)
+ vst2.32 {d0[0],d2[0]}, [r0]!
+ vst2.32 {d0[1],d2[1]}, [r0]!
+ vst2.32 {d1[0],d3[0]}, [r0]!
+ vst2.32 {d1[1],d3[1]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_f32_2)
+
+SNIP_START(_N_ColorMatrix_store_f32_1)
+ vst1.32 {q0}, [r0]!
+SNIP_END(_N_ColorMatrix_store_f32_1)
+
+
+SNIP_START(_N_ColorMatrix_unpack_u8_4)
+ vmovl.u8 q12, d0 /* R */
+ vmovl.u8 q13, d1 /* G */
+ vmovl.u8 q14, d2 /* B */
+ vmovl.u8 q15, d3 /* A */
+SNIP_END(_N_ColorMatrix_unpack_u8_4)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_3)
+ vmovl.u8 q12, d0 /* R */
+ vmovl.u8 q13, d1 /* G */
+ vmovl.u8 q14, d2 /* B */
+ veor q15, q15
+SNIP_END(_N_ColorMatrix_unpack_u8_3)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_2)
+ vmovl.u8 q12, d0 /* R */
+ vmovl.u8 q13, d1 /* G */
+ veor q14, q14
+ veor q15, q15
+SNIP_END(_N_ColorMatrix_unpack_u8_2)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_1)
+ vmovl.u8 q12, d0 /* R */
+ veor q13, q13
+ veor q14, q14
+ veor q15, q15
+SNIP_END(_N_ColorMatrix_unpack_u8_1)
+
+SNIP_START(_N_ColorMatrix_pack_u8_4)
+ vqrshrn.s32 d24, q8, #8
+ vqrshrn.s32 d26, q9, #8
+ vqrshrn.s32 d28, q10, #8
+ vqrshrn.s32 d30, q11, #8
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+SNIP_END(_N_ColorMatrix_pack_u8_4)
+
+SNIP_START(_N_ColorMatrix_pack_u8_3)
+ vqrshrn.s32 d24, q8, #8
+ vqrshrn.s32 d26, q9, #8
+ vqrshrn.s32 d28, q10, #8
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vqmovun.s16 d2, q14
+SNIP_END(_N_ColorMatrix_pack_u8_3)
+
+SNIP_START(_N_ColorMatrix_pack_u8_2)
+ vqrshrn.s32 d24, q8, #8
+ vqrshrn.s32 d26, q9, #8
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+SNIP_END(_N_ColorMatrix_pack_u8_2)
+
+SNIP_START(_N_ColorMatrix_pack_u8_1)
+ vqrshrn.s32 d24, q8, #8
+ vqmovun.s16 d0, q12
+SNIP_END(_N_ColorMatrix_pack_u8_1)
+
+SNIP_START(_N_ColorMatrix_dot)
+ vmov.u8 d1, d0
+ vmov.u8 d2, d0
+SNIP_END(_N_ColorMatrix_dot)
+
diff --git a/toolkit/Convolve3x3.cpp b/toolkit/Convolve3x3.cpp
new file mode 100644
index 0000000..51339a2
--- /dev/null
+++ b/toolkit/Convolve3x3.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.Convolve3x3"
+
+namespace android {
+namespace renderscript {
+
+extern "C" void rsdIntrinsicConvolve3x3_K(void* dst, const void* y0, const void* y1, const void* y2,
+ const int16_t* coef, uint32_t count);
+
+class Convolve3x3Task : public Task {
+ const void* mIn;
+ void* mOut;
+ // Even though we have exactly 9 coefficients, store them in an array of size 16 so that
+ // the SIMD instructions can load them in chunks that are multiples of 8.
+ float mFp[16];
+ int16_t mIp[16];
+
+ void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
+ const uchar* py2);
+ void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
+ size_t startX, size_t startY, size_t endX, size_t endY);
+
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ public:
+ Convolve3x3Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
+ const float* coefficients, const Restriction* restriction)
+ : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
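+ // Mirror the float coefficients as Q8.8 fixed point (scaled by 256, rounded half away
+ // from zero) for the integer SIMD path; e.g. 0.111f becomes (int16_t)(0.111f * 256.f + 0.5f) = 28.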
+ for (int ct = 0; ct < 9; ct++) {
+ mFp[ct] = coefficients[ct];
+ if (mFp[ct] >= 0) {
+ mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
+ } else {
+ mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
+ }
+ }
+ }
+};
+
+/**
+ * Computes one convolution and stores the result in the output. This is used for uchar, uchar2,
+ * uchar3, and uchar4 vectors.
+ *
+ * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. uchar4.
+ * @tparam ComputationType Type we use for the intermediate computations.
+ * @param x The index in the row of the value we'll convolve.
+ * @param out The location in the output array where we store the value.
+ * @param py0 The start of the top row.
+ * @param py1 The start of the middle row.
+ * @param py2 The start of the bottom row.
+ * @param coeff Pointer to the float coefficients, in row major format.
+ * @param sizeX The number of cells in one row.
+ */
+template <typename InputOutputType, typename ComputationType>
+static void convolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
+ const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
+ int32_t sizeX) {
+ uint32_t x1 = std::max((int32_t)x - 1, 0);
+ uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1);
+
+ ComputationType px = convert<ComputationType>(py0[x1]) * coeff[0] +
+ convert<ComputationType>(py0[x]) * coeff[1] +
+ convert<ComputationType>(py0[x2]) * coeff[2] +
+ convert<ComputationType>(py1[x1]) * coeff[3] +
+ convert<ComputationType>(py1[x]) * coeff[4] +
+ convert<ComputationType>(py1[x2]) * coeff[5] +
+ convert<ComputationType>(py2[x1]) * coeff[6] +
+ convert<ComputationType>(py2[x]) * coeff[7] +
+ convert<ComputationType>(py2[x2]) * coeff[8];
+
+ px = clamp(px + 0.5f, 0.f, 255.f);
+ *out = convert<InputOutputType>(px);
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+/**
+ * Computes one convolution and stores the result in the output. This is used for float, float2,
+ * float3, and float4 vectors.
+ *
+ * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. float4.
+ * @param x The index in the row of the value we'll convolve.
+ * @param out The location in the output array where we store the value.
+ * @param py0 The start of the top row.
+ * @param py1 The start of the middle row.
+ * @param py2 The start of the bottom row.
+ * @param coeff Pointer to the float coefficients, in row major format.
+ * @param sizeX The number of cells in one row.
+ */
+template <typename InputOutputType>
+static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
+ const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
+ int32_t sizeX) {
+ uint32_t x1 = std::max((int32_t)x - 1, 0);
+ uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1);
+ *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
+ (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
+ (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
+}
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+/**
+ * This function convolves one line.
+ *
+ * @param pout Where to place the next output.
+ * @param xstart Index in the X direction of where to start.
+ * @param xend Index in the X direction of where to end (exclusive).
+ * @param ppy0 Points to the start of the previous line.
+ * @param ppy1 Points to the start of the current line.
+ * @param ppy2 Points to the start of the next line.
+ */
+void Convolve3x3Task::kernelU4(uchar* pout, uint32_t xstart, uint32_t xend, const uchar* ppy0,
+ const uchar* ppy1, const uchar* ppy2) {
+ uchar4* out = (uchar4*)pout;
+ const uchar4* py0 = (const uchar4*)ppy0;
+ const uchar4* py1 = (const uchar4*)ppy1;
+ const uchar4* py2 = (const uchar4*)ppy2;
+
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+ if (x1 == 0) {
+ convolveOneU<uchar4, float4>(0, out, py0, py1, py2, mFp, mSizeX);
+ x1++;
+ out++;
+ }
+
+ if (x2 > x1) {
+#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
+ if (mUsesSimd) {
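+ // The SIMD kernel consumes two uchar4 pixels per iteration, so len counts pixel pairs.
+ // The "- 1" keeps at least one column for the scalar loop below, which clamps at the right edge.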
+ int32_t len = (x2 - x1 - 1) >> 1;
+ if (len > 0) {
+ rsdIntrinsicConvolve3x3_K(out, &py0[x1 - 1], &py1[x1 - 1], &py2[x1 - 1], mIp, len);
+ x1 += len << 1;
+ out += len << 1;
+ }
+ }
+#endif
+
+ while (x1 != x2) {
+ convolveOneU<uchar4, float4>(x1, out, py0, py1, py2, mFp, mSizeX);
+ out++;
+ x1++;
+ }
+ }
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+template <typename T>
+void RsdCpuScriptIntrinsicConvolve3x3_kernelF(void* in, T* out, uint32_t xstart, uint32_t xend,
+ uint32_t currentY, size_t sizeX, size_t sizeY,
+ size_t vectorSize, float* fp) {
+ const uchar* pin = (const uchar*)in;
+ const size_t stride = sizeX * vectorSize * 4; // float takes 4 bytes
+
+ uint32_t y1 = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1));
+ uint32_t y2 = std::max((int32_t)currentY - 1, 0);
+ const T* py0 = (const T*)(pin + stride * y2);
+ const T* py1 = (const T*)(pin + stride * currentY);
+ const T* py2 = (const T*)(pin + stride * y1);
+
+ for (uint32_t x = xstart; x < xend; x++, out++) {
+ ConvolveOneF<T>(x, out, py0, py1, py2, fp, sizeX);
+ }
+}
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+template <typename InputOutputType, typename ComputationType>
+static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
+ size_t startX, size_t startY, size_t endX, size_t endY, float* fp) {
+ const size_t stride = vectorSize * sizeX;
+ for (size_t y = startY; y < endY; y++) {
+ uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
+ uint32_t y2 = std::max((int32_t)y - 1, 0);
+
+ size_t offset = (y * sizeX + startX) * vectorSize;
+ InputOutputType* px = (InputOutputType*)(pout + offset);
+ InputOutputType* py0 = (InputOutputType*)(pin + stride * y2);
+ InputOutputType* py1 = (InputOutputType*)(pin + stride * y);
+ InputOutputType* py2 = (InputOutputType*)(pin + stride * y1);
+ for (uint32_t x = startX; x < endX; x++, px++) {
+ convolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, fp, sizeX);
+ }
+ }
+}
+
+void Convolve3x3Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
+ size_t sizeY, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ const size_t stride = paddedSize(vectorSize) * sizeX;
+ for (size_t y = startY; y < endY; y++) {
+ uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
+ uint32_t y2 = std::max((int32_t)y - 1, 0);
+
+ size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
+ uchar* px = pout + offset;
+ const uchar* py0 = pin + stride * y2;
+ const uchar* py1 = pin + stride * y;
+ const uchar* py2 = pin + stride * y1;
+ kernelU4(px, startX, endX, py0, py1, py2);
+ }
+}
+
+void Convolve3x3Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
+ // endX, endY);
+ switch (mVectorSize) {
+ case 1:
+ convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
+ startX, startY, endX, endY, mFp);
+ break;
+ case 2:
+ convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
+ startX, startY, endX, endY, mFp);
+ break;
+ case 3:
+ case 4:
+ convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
+ endX, endY);
+ break;
+ }
+}
+
+void RenderScriptToolkit::convolve3x3(const void* in, void* out, size_t vectorSize, size_t sizeX,
+ size_t sizeY, const float* coefficients,
+ const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+ return;
+ }
+ if (vectorSize < 1 || vectorSize > 4) {
+ ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+ return;
+ }
+#endif
+
+ Convolve3x3Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
+ processor->doTask(&task);
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/Convolve5x5.cpp b/toolkit/Convolve5x5.cpp
new file mode 100644
index 0000000..1f3f75c
--- /dev/null
+++ b/toolkit/Convolve5x5.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Convolve5x5"
+
+extern "C" void rsdIntrinsicConvolve5x5_K(void* dst, const void* y0, const void* y1, const void* y2,
+ const void* y3, const void* y4, const int16_t* coef,
+ uint32_t count);
+
+class Convolve5x5Task : public Task {
+ const void* mIn;
+ void* mOut;
+ // Even though we have exactly 25 coefficients, store them in an array of size 28 so that
+ // the SIMD instructions can load them in three chunks of 8 and one chunk of 4.
+ float mFp[28];
+ int16_t mIp[28];
+
+ void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
+ const uchar* py2, const uchar* py3, const uchar* py4);
+ void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
+ size_t startX, size_t startY, size_t endX, size_t endY);
+
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ public:
+ Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
+ const float* coefficients, const Restriction* restriction)
+ : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
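+ // Mirror the float coefficients as Q8.8 fixed point for the integer SIMD path, using the
+ // same rounding convention as Convolve3x3Task.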
+ for (int ct = 0; ct < 25; ct++) {
+ mFp[ct] = coefficients[ct];
+ if (mFp[ct] >= 0) {
+ mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
+ } else {
+ mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
+ }
+ }
+ }
+};
+
+template <typename InputOutputType, typename ComputationType>
+static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
+ const InputOutputType* py1, const InputOutputType* py2,
+ const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
+ int32_t width) {
+ uint32_t x0 = std::max((int32_t)x - 2, 0);
+ uint32_t x1 = std::max((int32_t)x - 1, 0);
+ uint32_t x2 = x;
+ uint32_t x3 = std::min((int32_t)x + 1, width - 1);
+ uint32_t x4 = std::min((int32_t)x + 2, width - 1);
+
+ ComputationType px = convert<ComputationType>(py0[x0]) * coeff[0] +
+ convert<ComputationType>(py0[x1]) * coeff[1] +
+ convert<ComputationType>(py0[x2]) * coeff[2] +
+ convert<ComputationType>(py0[x3]) * coeff[3] +
+ convert<ComputationType>(py0[x4]) * coeff[4] +
+
+ convert<ComputationType>(py1[x0]) * coeff[5] +
+ convert<ComputationType>(py1[x1]) * coeff[6] +
+ convert<ComputationType>(py1[x2]) * coeff[7] +
+ convert<ComputationType>(py1[x3]) * coeff[8] +
+ convert<ComputationType>(py1[x4]) * coeff[9] +
+
+ convert<ComputationType>(py2[x0]) * coeff[10] +
+ convert<ComputationType>(py2[x1]) * coeff[11] +
+ convert<ComputationType>(py2[x2]) * coeff[12] +
+ convert<ComputationType>(py2[x3]) * coeff[13] +
+ convert<ComputationType>(py2[x4]) * coeff[14] +
+
+ convert<ComputationType>(py3[x0]) * coeff[15] +
+ convert<ComputationType>(py3[x1]) * coeff[16] +
+ convert<ComputationType>(py3[x2]) * coeff[17] +
+ convert<ComputationType>(py3[x3]) * coeff[18] +
+ convert<ComputationType>(py3[x4]) * coeff[19] +
+
+ convert<ComputationType>(py4[x0]) * coeff[20] +
+ convert<ComputationType>(py4[x1]) * coeff[21] +
+ convert<ComputationType>(py4[x2]) * coeff[22] +
+ convert<ComputationType>(py4[x3]) * coeff[23] +
+ convert<ComputationType>(py4[x4]) * coeff[24];
+ px = clamp(px + 0.5f, 0.f, 255.f);
+ *out = convert<InputOutputType>(px);
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+template <typename InputOutputType>
+static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
+ const InputOutputType* py1, const InputOutputType* py2,
+ const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
+ int32_t width) {
+ uint32_t x0 = std::max((int32_t)x - 2, 0);
+ uint32_t x1 = std::max((int32_t)x - 1, 0);
+ uint32_t x2 = x;
+ uint32_t x3 = std::min((int32_t)x + 1, width - 1);
+ uint32_t x4 = std::min((int32_t)x + 2, width - 1);
+
+ InputOutputType px = py0[x0] * coeff[0] + py0[x1] * coeff[1] + py0[x2] * coeff[2] +
+ py0[x3] * coeff[3] + py0[x4] * coeff[4] +
+
+ py1[x0] * coeff[5] + py1[x1] * coeff[6] + py1[x2] * coeff[7] +
+ py1[x3] * coeff[8] + py1[x4] * coeff[9] +
+
+ py2[x0] * coeff[10] + py2[x1] * coeff[11] + py2[x2] * coeff[12] +
+ py2[x3] * coeff[13] + py2[x4] * coeff[14] +
+
+ py3[x0] * coeff[15] + py3[x1] * coeff[16] + py3[x2] * coeff[17] +
+ py3[x3] * coeff[18] + py3[x4] * coeff[19] +
+
+ py4[x0] * coeff[20] + py4[x1] * coeff[21] + py4[x2] * coeff[22] +
+ py4[x3] * coeff[23] + py4[x4] * coeff[24];
+ *out = px;
+}
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+/**
+ * This function convolves one line.
+ *
+ * @param pout Where to place the next output.
+ * @param x1 Index in the X direction of where to start.
+ * @param x2 Index in the X direction of where to end (exclusive).
+ * @param ppy0 Points to the start of the line two above.
+ * @param ppy1 Points to the start of the line one above.
+ * @param ppy2 Points to the start of the current line.
+ * @param ppy3 Points to the start of the line one below.
+ * @param ppy4 Points to the start of the line two below.
+ */
+void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0,
+ const uchar* ppy1, const uchar* ppy2, const uchar* ppy3,
+ const uchar* ppy4) {
+ uchar4* out = (uchar4*)pout;
+ const uchar4* py0 = (const uchar4*)ppy0;
+ const uchar4* py1 = (const uchar4*)ppy1;
+ const uchar4* py2 = (const uchar4*)ppy2;
+ const uchar4* py3 = (const uchar4*)ppy3;
+ const uchar4* py4 = (const uchar4*)ppy4;
+
+ while ((x1 < x2) && (x1 < 2)) {
+ ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
+ out++;
+ x1++;
+ }
+#if defined(ARCH_X86_HAVE_SSSE3)
+ // For x86 SIMD, require a minimum of 7 elements (4 for the SIMD step,
+ // 3 for the right boundary, where x could otherwise read past the edge).
+ if (mUsesSimd && ((x1 + 6) < x2)) {
+ // subtract 3 for end boundary
+ uint32_t len = (x2 - x1 - 3) >> 2;
+ rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
+ py4 + x1 - 2, mIp, len);
+ out += len << 2;
+ x1 += len << 2;
+ }
+#endif
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+ if (mUsesSimd && ((x1 + 3) < x2)) {
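+ // The NEON kernel consumes two uchar4 pixels per iteration; at least three columns are
+ // left for the scalar loop below, which clamps at the right edge.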
+ uint32_t len = (x2 - x1 - 3) >> 1;
+ rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
+ py4 + x1 - 2, mIp, len);
+ out += len << 1;
+ x1 += len << 1;
+ }
+#endif
+
+ while (x1 < x2) {
+ ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
+ out++;
+ x1++;
+ }
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+// This will need more cleanup before it can be used.
+void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out,
+ uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ const uchar* pin = (const uchar*)info->in;
+ const size_t stride = info->stride;
+
+ uint32_t y0 = std::max((int32_t)currentY - 2, 0);
+ uint32_t y1 = std::max((int32_t)currentY - 1, 0);
+ uint32_t y2 = currentY;
+ uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
+ uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
+
+ const float4* py0 = (const float4*)(pin + stride * y0);
+ const float4* py1 = (const float4*)(pin + stride * y1);
+ const float4* py2 = (const float4*)(pin + stride * y2);
+ const float4* py3 = (const float4*)(pin + stride * y3);
+ const float4* py4 = (const float4*)(pin + stride * y4);
+
+ for (uint32_t x = xstart; x < xend; x++, out++) {
+ ConvolveOneF<float4>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
+ }
+}
+
+void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out,
+ uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ const uchar* pin = (const uchar*)info->in;
+ const size_t stride = info->stride;
+
+ uint32_t y0 = std::max((int32_t)currentY - 2, 0);
+ uint32_t y1 = std::max((int32_t)currentY - 1, 0);
+ uint32_t y2 = currentY;
+ uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
+ uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
+
+ const float2* py0 = (const float2*)(pin + stride * y0);
+ const float2* py1 = (const float2*)(pin + stride * y1);
+ const float2* py2 = (const float2*)(pin + stride * y2);
+ const float2* py3 = (const float2*)(pin + stride * y3);
+ const float2* py4 = (const float2*)(pin + stride * y4);
+
+ for (uint32_t x = xstart; x < xend; x++, out++) {
+ ConvolveOneF<float2>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
+ }
+}
+
+void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out,
+ uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ const uchar* pin = (const uchar*)info->in;
+ const size_t stride = info->stride;
+
+ uint32_t y0 = std::max((int32_t)currentY - 2, 0);
+ uint32_t y1 = std::max((int32_t)currentY - 1, 0);
+ uint32_t y2 = currentY;
+ uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
+ uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
+
+ const float* py0 = (const float*)(pin + stride * y0);
+ const float* py1 = (const float*)(pin + stride * y1);
+ const float* py2 = (const float*)(pin + stride * y2);
+ const float* py3 = (const float*)(pin + stride * y3);
+ const float* py4 = (const float*)(pin + stride * y4);
+
+ for (uint32_t x = xstart; x < xend; x++, out++) {
+ ConvolveOneF<float>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
+ }
+}
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+template <typename InputOutputType, typename ComputationType>
+static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
+ size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) {
+ const size_t stride = vectorSize * sizeX;
+ for (size_t y = startY; y < endY; y++) {
+ uint32_t y0 = std::max((int32_t)y - 2, 0);
+ uint32_t y1 = std::max((int32_t)y - 1, 0);
+ uint32_t y2 = y;
+ uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
+ uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
+
+ size_t offset = (y * sizeX + startX) * vectorSize;
+ InputOutputType* px = (InputOutputType*)(pout + offset);
+ InputOutputType* py0 = (InputOutputType*)(pin + stride * y0);
+ InputOutputType* py1 = (InputOutputType*)(pin + stride * y1);
+ InputOutputType* py2 = (InputOutputType*)(pin + stride * y2);
+ InputOutputType* py3 = (InputOutputType*)(pin + stride * y3);
+ InputOutputType* py4 = (InputOutputType*)(pin + stride * y4);
+ for (uint32_t x = startX; x < endX; x++, px++) {
+ ConvolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, py3, py4, mFp,
+ sizeX);
+ }
+ }
+}
+
+void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
+ size_t sizeY, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ const size_t stride = paddedSize(vectorSize) * sizeX;
+ for (size_t y = startY; y < endY; y++) {
+ uint32_t y0 = std::max((int32_t)y - 2, 0);
+ uint32_t y1 = std::max((int32_t)y - 1, 0);
+ uint32_t y2 = y;
+ uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
+ uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
+
+ size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
+ uchar* px = pout + offset;
+ const uchar* py0 = pin + stride * y0;
+ const uchar* py1 = pin + stride * y1;
+ const uchar* py2 = pin + stride * y2;
+ const uchar* py3 = pin + stride * y3;
+ const uchar* py4 = pin + stride * y4;
+ kernelU4(px, startX, endX, py0, py1, py2, py3, py4);
+ }
+}
+
+void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
+ // endX, endY);
+ switch (mVectorSize) {
+ case 1:
+ convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
+ startX, startY, endX, endY, mFp);
+ break;
+ case 2:
+ convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
+ startX, startY, endX, endY, mFp);
+ break;
+ case 3:
+ case 4:
+ convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
+ endX, endY);
+ break;
+ }
+}
+
+void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX,
+ size_t sizeY, const float* coefficients,
+ const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+ return;
+ }
+ if (vectorSize < 1 || vectorSize > 4) {
+ ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+ return;
+ }
+#endif
+
+ Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
+ processor->doTask(&task);
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/Convolve_advsimd.S b/toolkit/Convolve_advsimd.S
new file mode 100644
index 0000000..0daa0c5
--- /dev/null
+++ b/toolkit/Convolve_advsimd.S
@@ -0,0 +1,265 @@
+/*
+ * Copyright (C) 2012,2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ x0 = dst
+ x1 = y0 base pointer
+ x2 = y1 base pointer
+ x3 = y2 base pointer
+ x4 = coeffs
+ x5 = length / 2
+*/
+
+#define ENTRY(f) .text; .align 2; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+ENTRY(rsdIntrinsicConvolve3x3_K)
+ sub x6, sp, #64
+ sub sp, sp, #64
+ st1 {v8.1d-v11.1d}, [x6], #32
+ st1 {v12.1d-v15.1d}, [x6]
+
+ /* Load the coefficients in the v0, v1 registers */
+ ld1 {v0.8h, v1.8h}, [x4]
+
+ /* Load the frequently used immediate in a register */
+ mov x4, #8
+
+1:
+ /* Load and post-increment the address by x4=#8 */
+ ld1 {v13.16b}, [x1], x4
+ ld1 {v14.16b}, [x2], x4
+ ld1 {v15.16b}, [x3], x4
+
+ /* Prefetch the data that will be used in the iteration after the next */
+// prfm PLDL1KEEP,[x1, x4] // TODO: test this
+// prfm PLDL1KEEP,[x2, x4] // TODO: test this
+// prfm PLDL1KEEP,[x3, x4] // TODO: test this
+
+ uxtl v2.8h, v13.8b
+ uxtl2 v3.8h, v13.16b
+ uxtl v4.8h, v14.8b
+ uxtl2 v5.8h, v14.16b
+ uxtl v6.8h, v15.8b
+ uxtl2 v7.8h, v15.16b
+
+/*
+ The two pixel source array is
+ v2, v2hi, v3lo, v3hi
+ v4, v4hi, v5lo, v5hi
+ v6, v6hi, v7lo, v7hi
+*/
+
+ smull v8.4s, v2.4h, v0.h[0]
+ smull2 v9.4s, v2.8h, v0.h[0]
+ smlal2 v8.4s, v2.8h, v0.h[1]
+ smlal v9.4s, v3.4h, v0.h[1]
+ smlal v8.4s, v3.4h, v0.h[2]
+ smlal2 v9.4s, v3.8h, v0.h[2]
+ smlal v8.4s, v4.4h, v0.h[3]
+ smlal2 v9.4s, v4.8h, v0.h[3]
+ smlal2 v8.4s, v4.8h, v0.h[4]
+ smlal v9.4s, v5.4h, v0.h[4]
+ smlal v8.4s, v5.4h, v0.h[5]
+ smlal2 v9.4s, v5.8h, v0.h[5]
+ smlal v8.4s, v6.4h, v0.h[6]
+ smlal2 v9.4s, v6.8h, v0.h[6]
+ smlal2 v8.4s, v6.8h, v0.h[7]
+ smlal v9.4s, v7.4h, v0.h[7]
+ smlal v8.4s, v7.4h, v1.h[0]
+ smlal2 v9.4s, v7.8h, v1.h[0]
+
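+ /* The coefficients are Q8.8 fixed point, so shift the accumulators right by 8 to return to pixel scale */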
+ shrn v8.4h, v8.4s, #8
+ shrn2 v8.8h, v9.4s, #8
+
+ sqxtun v8.8b, v8.8h
+ st1 {v8.8b}, [x0], #8
+
+ /* Are we done yet? */
+ subs x5, x5, #1
+ bne 1b
+
+ /* We're done, bye! */
+ ld1 {v8.1d-v11.1d}, [sp], #32
+ ld1 {v12.1d-v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicConvolve3x3_K)
+
+
+/* Convolve 5x5 */
+
+/*
+ x0 = dst
+ x1 = y0 base pointer
+ x2 = y1 base pointer
+ x3 = y2 base pointer
+ x4 = y3 base pointer
+ x5 = y4 base pointer
+ x6 = coeffs
+ x7 = length
+*/
+ENTRY(rsdIntrinsicConvolve5x5_K)
+ sub x8, sp, #64
+ sub sp, sp, #64
+ st1 {v8.1d-v11.1d}, [x8], #32
+ st1 {v12.1d-v15.1d}, [x8]
+
+ /* Create the coefficients vector */
+ ld1 {v0.8h-v2.8h}, [x6], #48
+ ld1 {v3.4h}, [x6], #8
+
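+ /* v15 holds a rounding bias (0x7f) that is added to each accumulator before the final narrowing shift */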
+ movi v15.4s, #0x7f
+
+ /* Load the frequently used immediate in a register */
+ mov x6, #8
+
+1:
+ /* Load rows from the y base pointers, post-incrementing each address by x6=#8 */
+ ld1 {v9.8b-v11.8b}, [x1], x6 // y0 ( y - 2 )
+ ld1 {v12.8b-v14.8b}, [x2], x6 // y0 ( y - 1 )
+
+ /* Prefetch the data that will be used in the iteration after the next */
+// prfm PLDL1KEEP,[x1, x6] // TODO: test this
+// prfm PLDL1KEEP,[x2, x6] // TODO: test this
+
+ /* Promoting the 8bit channels to 16bit */
+ uxtl v9.8h, v9.8b
+ uxtl v10.8h, v10.8b
+ uxtl v11.8h, v11.8b
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+
+/*
+ v9, v9hi, v10lo, v10hi, v11lo, v11hi,
+ v12, v12hi
+*/
+ smull v4.4s, v9.4h, v0.h[0]
+ smull2 v5.4s, v9.8h, v0.h[0]
+ smlal2 v4.4s, v9.8h, v0.h[1]
+ smlal v5.4s, v10.4h, v0.h[1]
+ smlal v4.4s, v10.4h, v0.h[2]
+ smlal2 v5.4s, v10.8h, v0.h[2]
+ smlal2 v4.4s, v10.8h, v0.h[3]
+ smlal v5.4s, v11.4h, v0.h[3]
+ smlal v4.4s, v11.4h, v0.h[4]
+ smlal2 v5.4s, v11.8h, v0.h[4]
+
+ smlal v4.4s, v12.4h, v0.h[5]
+ smlal2 v5.4s, v12.8h, v0.h[5]
+ smlal2 v4.4s, v12.8h, v0.h[6]
+ smlal v5.4s, v13.4h, v0.h[6]
+ smlal v4.4s, v13.4h, v0.h[7]
+ smlal2 v5.4s, v13.8h, v0.h[7]
+ smlal2 v4.4s, v13.8h, v1.h[0]
+ smlal v5.4s, v14.4h, v1.h[0]
+ smlal v4.4s, v14.4h, v1.h[1]
+ smlal2 v5.4s, v14.8h, v1.h[1]
+
+ /* Next 2 rows */
+ /* Load rows from the y base pointers, post-incrementing each address by x6=#8 */
+ ld1 {v9.8b-v11.8b}, [x3], x6 // y0 ( y )
+ ld1 {v12.8b-v14.8b}, [x4], x6 // y0 ( y + 1 )
+
+ /* Prefetch the data that will be used in the iteration after the next */
+// prfm PLDL1KEEP,[x3, x6] // TODO: test this
+// prfm PLDL1KEEP,[x4, x6] // TODO: test this
+
+ /* Promoting the 8bit channels to 16bit */
+ uxtl v9.8h, v9.8b
+ uxtl v10.8h, v10.8b
+ uxtl v11.8h, v11.8b
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+
+/*
+ v9, v9hi, v10lo, v10hi, v11lo, v11hi,
+ v12, v12hi
+*/
+ smlal v4.4s, v9.4h, v1.h[2]
+ smlal2 v5.4s, v9.8h, v1.h[2]
+ smlal2 v4.4s, v9.8h, v1.h[3]
+ smlal v5.4s, v10.4h, v1.h[3]
+ smlal v4.4s, v10.4h, v1.h[4]
+ smlal2 v5.4s, v10.8h, v1.h[4]
+ smlal2 v4.4s, v10.8h, v1.h[5]
+ smlal v5.4s, v11.4h, v1.h[5]
+ smlal v4.4s, v11.4h, v1.h[6]
+ smlal2 v5.4s, v11.8h, v1.h[6]
+
+ smlal v4.4s, v12.4h, v1.h[7]
+ smlal2 v5.4s, v12.8h, v1.h[7]
+ smlal2 v4.4s, v12.8h, v2.h[0]
+ smlal v5.4s, v13.4h, v2.h[0]
+ smlal v4.4s, v13.4h, v2.h[1]
+ smlal2 v5.4s, v13.8h, v2.h[1]
+ smlal2 v4.4s, v13.8h, v2.h[2]
+ smlal v5.4s, v14.4h, v2.h[2]
+ smlal v4.4s, v14.4h, v2.h[3]
+ smlal2 v5.4s, v14.8h, v2.h[3]
+
+ /* Last row */
+ /* Load rows from the y base pointers, post-incrementing each address by x6=#8 */
+ ld1 {v9.8b- v11.8b}, [x5], x6 // y0 ( y + 2 )
+
+ /* Prefetch the data that will be used in the iteration after the next */
+// prfm PLDL1KEEP,[x5, x6] // TODO: test this
+
+ /* Promoting the 8bit channels to 16bit */
+ uxtl v9.8h, v9.8b
+ uxtl v10.8h, v10.8b
+ uxtl v11.8h, v11.8b
+
+/*
+ v9, v9hi, v10lo, v10hi, v11lo, v11hi,
+ v12, v12hi
+*/
+
+ smlal v4.4s, v9.4h, v2.h[4]
+ smlal2 v5.4s, v9.8h, v2.h[4]
+ smlal2 v4.4s, v9.8h, v2.h[5]
+ smlal v5.4s, v10.4h, v2.h[5]
+ smlal v4.4s, v10.4h, v2.h[6]
+ smlal2 v5.4s, v10.8h, v2.h[6]
+ smlal2 v4.4s, v10.8h, v2.h[7]
+ smlal v5.4s, v11.4h, v2.h[7]
+ smlal v4.4s, v11.4h, v3.h[0]
+ smlal2 v5.4s, v11.8h, v3.h[0]
+
+ add v4.4s, v4.4s, v15.4s
+ add v5.4s, v5.4s, v15.4s
+
+/* Narrow it to a d-reg 32 -> 16 bit */
+ rshrn v4.4h, v4.4s, #8
+ rshrn2 v4.8h, v5.4s, #8
+
+
+/* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
+ sqxtun v4.8b, v4.8h
+
+ st1 {v4.8b}, [x0], #8 // store the output and advance x0
+
+ /* Are we done? */
+ subs x7, x7, #1
+ bne 1b
+
+ /* Yup, bye */
+ ld1 {v8.1d-v11.1d}, [sp], #32
+ ld1 {v12.1d-v15.1d}, [sp], #32
+ ret
+
+END(rsdIntrinsicConvolve5x5_K)
diff --git a/toolkit/Convolve_neon.S b/toolkit/Convolve_neon.S
new file mode 100644
index 0000000..ee10884
--- /dev/null
+++ b/toolkit/Convolve_neon.S
@@ -0,0 +1,287 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ r0 = dst
+ r1 = y0 base pointer
+ r2 = y1 base pointer
+ r3 = y2 base pointer
+ sp = coeffs
+ sp = length / 2
+*/
+
+#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+ENTRY(rsdIntrinsicConvolve3x3_K)
+ push {r4-r8, r10, r11, lr}
+ vpush {q4-q7}
+
+ /* Get the coeffs pointer from the stack and load the
+ coefficients in the q0, q1 NEON registers */
+ ldr r4, [sp, #32+64]
+ vld1.16 {q0, q1}, [r4]
+
+ /* Get count from the stack */
+ ldr r4, [sp, #36+64]
+
+ /* Load the frequently used immediate in a register */
+ mov r5, #8
+
+1:
+ /* Load and post-increment the address by r5=#8 */
+ vld1.8 {q13}, [r1], r5
+ vld1.8 {q14}, [r2], r5
+ vld1.8 {q15}, [r3], r5
+
+ /* Prefetch the data that will be used in the iteration after the next */
+ pld [r1, r5]
+ pld [r2, r5]
+ pld [r3, r5]
+
+ vmovl.u8 q2, d26
+ vmovl.u8 q3, d27
+ vmovl.u8 q4, d28
+ vmovl.u8 q5, d29
+ vmovl.u8 q6, d30
+ vmovl.u8 q7, d31
+
+/*
+ The two pixel source array is
+ d4, d5, d6, d7
+ d8, d9, d10, d11
+ d12, d13, d14, d15
+*/
+
+ vmull.s16 q8, d4, d0[0]
+ vmlal.s16 q8, d5, d0[1]
+ vmlal.s16 q8, d6, d0[2]
+ vmlal.s16 q8, d8, d0[3]
+ vmlal.s16 q8, d9, d1[0]
+ vmlal.s16 q8, d10, d1[1]
+ vmlal.s16 q8, d12, d1[2]
+ vmlal.s16 q8, d13, d1[3]
+ vmlal.s16 q8, d14, d2[0]
+
+ vmull.s16 q9, d5, d0[0]
+ vmlal.s16 q9, d6, d0[1]
+ vmlal.s16 q9, d7, d0[2]
+ vmlal.s16 q9, d9, d0[3]
+ vmlal.s16 q9, d10, d1[0]
+ vmlal.s16 q9, d11, d1[1]
+ vmlal.s16 q9, d13, d1[2]
+ vmlal.s16 q9, d14, d1[3]
+ vmlal.s16 q9, d15, d2[0]
+
+ vshrn.i32 d16, q8, #8
+ vshrn.i32 d17, q9, #8
+
+ vqmovun.s16 d16, q8
+ vst1.8 d16, [r0]!
+
+ /* Are we done yet? */
+ subs r4, r4, #1
+ bne 1b
+
+ /* We're done, bye! */
+ vpop {q4-q7}
+ pop {r4-r8, r10, r11, lr}
+ bx lr
+END(rsdIntrinsicConvolve3x3_K)
+
+
+/* Convolve 5x5 */
+
+/*
+ r0 = dst
+ r1 = y0 base pointer
+ r2 = y1 base pointer
+ r3 = y2 base pointer
+ r4 = y3 base pointer
+ r5 = y4 base pointer
+ r6 = coeffs
+ r7 = length
+*/
+ENTRY(rsdIntrinsicConvolve5x5_K)
+ push {r4-r7, lr}
+ vpush {q4-q7}
+
+ /* load y3 in r4 */
+ ldr r4, [sp, #20 + 64]
+
+ /* load y4 in r5 */
+ ldr r5, [sp, #24 + 64]
+
+ /* Load the coefficients pointer */
+ ldr r6, [sp, #28 + 64]
+
+ /* Create the coefficients vector */
+ vld1.16 {d0, d1, d2, d3}, [r6]!
+ vld1.16 {d4, d5, d6}, [r6]
+
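+ /* q15 holds a rounding bias (0x7f) that is added to each accumulator before the final narrowing shift */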
+ vmov.u32 q15, #0x7f
+
+ /* load the count */
+ ldr r6, [sp, #32 + 64]
+
+ /* Load the frequently used immediate in a register */
+ mov r7, #8
+
+1:
+ /* Load rows from the y base pointers, post-incrementing each address by r7=#8 */
+ vld1.8 {d24, d25, d26}, [r1], r7 @ y0 ( y - 2 )
+ vld1.8 {d27, d28, d29}, [r2], r7 @ y0 ( y - 1 )
+
+ /* Prefetch the data that will be used in the iteration after the next */
+ pld [r1, r7]
+ pld [r2, r7]
+
+ /* Promoting the 8bit channels to 16bit */
+ vmovl.u8 q9, d24
+ vmovl.u8 q10, d25
+ vmovl.u8 q11, d26
+ vmovl.u8 q12, d27
+ vmovl.u8 q13, d28
+ vmovl.u8 q14, d29
+
+/*
+ d18, d19, d20, d21, d22, d23,
+ d24, d25
+*/
+ vmull.s16 q4, d18, d0[0]
+ vmlal.s16 q4, d19, d0[1]
+ vmlal.s16 q4, d20, d0[2]
+ vmlal.s16 q4, d21, d0[3]
+ vmlal.s16 q4, d22, d1[0]
+
+ vmlal.s16 q4, d24, d1[1]
+ vmlal.s16 q4, d25, d1[2]
+ vmlal.s16 q4, d26, d1[3]
+ vmlal.s16 q4, d27, d2[0]
+ vmlal.s16 q4, d28, d2[1]
+
+ vmull.s16 q5, d19, d0[0]
+ vmlal.s16 q5, d20, d0[1]
+ vmlal.s16 q5, d21, d0[2]
+ vmlal.s16 q5, d22, d0[3]
+ vmlal.s16 q5, d23, d1[0]
+
+ vmlal.s16 q5, d25, d1[1]
+ vmlal.s16 q5, d26, d1[2]
+ vmlal.s16 q5, d27, d1[3]
+ vmlal.s16 q5, d28, d2[0]
+ vmlal.s16 q5, d29, d2[1]
+
+
+ /* Next 2 rows */
+ /* Load rows from the y base pointers, post-incrementing each address by r7=#8 */
+ vld1.8 {d24, d25, d26}, [r3], r7 @ y0 ( y )
+ vld1.8 {d27, d28, d29}, [r4], r7 @ y0 ( y + 1 )
+
+ /* Prefetch the data that will be used in the iteration after the next */
+ pld [r3, r7]
+ pld [r4, r7]
+
+ /* Promoting the 8bit channels to 16bit */
+ vmovl.u8 q9, d24
+ vmovl.u8 q10, d25
+ vmovl.u8 q11, d26
+ vmovl.u8 q12, d27
+ vmovl.u8 q13, d28
+ vmovl.u8 q14, d29
+
+/*
+ d18, d19, d20, d21, d22, d23,
+ d24, d25
+*/
+ vmlal.s16 q4, d18, d2[2]
+ vmlal.s16 q4, d19, d2[3]
+ vmlal.s16 q4, d20, d3[0]
+ vmlal.s16 q4, d21, d3[1]
+ vmlal.s16 q4, d22, d3[2]
+
+ vmlal.s16 q4, d24, d3[3]
+ vmlal.s16 q4, d25, d4[0]
+ vmlal.s16 q4, d26, d4[1]
+ vmlal.s16 q4, d27, d4[2]
+ vmlal.s16 q4, d28, d4[3]
+
+ vmlal.s16 q5, d19, d2[2]
+ vmlal.s16 q5, d20, d2[3]
+ vmlal.s16 q5, d21, d3[0]
+ vmlal.s16 q5, d22, d3[1]
+ vmlal.s16 q5, d23, d3[2]
+
+ vmlal.s16 q5, d25, d3[3]
+ vmlal.s16 q5, d26, d4[0]
+ vmlal.s16 q5, d27, d4[1]
+ vmlal.s16 q5, d28, d4[2]
+ vmlal.s16 q5, d29, d4[3]
+
+ /* Last row */
+ /* Load rows from the y base pointers, post-incrementing each address by r7=#8 */
+ vld1.8 {d24, d25, d26}, [r5], r7 @ y0 ( y + 2 )
+
+ /* Prefetch the data that will be used in the iteration after the next */
+ pld [r5, r7]
+
+ /* Promoting the 8bit channels to 16bit */
+ vmovl.u8 q9, d24
+ vmovl.u8 q10, d25
+ vmovl.u8 q11, d26
+
+/*
+ d18, d19, d20, d21, d22, d23,
+ d24, d25
+*/
+
+ vmlal.s16 q4, d18, d5[0]
+ vmlal.s16 q4, d19, d5[1]
+ vmlal.s16 q4, d20, d5[2]
+ vmlal.s16 q4, d21, d5[3]
+ vmlal.s16 q4, d22, d6[0]
+
+ vmlal.s16 q5, d19, d5[0]
+ vmlal.s16 q5, d20, d5[1]
+ vmlal.s16 q5, d21, d5[2]
+ vmlal.s16 q5, d22, d5[3]
+ vmlal.s16 q5, d23, d6[0]
+
+
+
+ vadd.i32 q4, q4, q15
+ vadd.i32 q5, q5, q15
+
+/* Narrow it to a d-reg 32 -> 16 bit */
+ vrshrn.i32 d8, q4, #8
+ vrshrn.i32 d9, q5, #8
+
+
+/* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
+ vqmovun.s16 d8, q4
+
+ vst1.8 d8, [r0]! @ store the output and advance r0
+
+ /* Are we done? */
+ subs r6, r6, #1
+ bne 1b
+
+ /* Yup, bye */
+ vpop {q4-q7}
+ pop {r4-r7, lr}
+ bx lr
+
+END(rsdIntrinsicConvolve5x5_K)
diff --git a/toolkit/Histogram.cpp b/toolkit/Histogram.cpp
new file mode 100644
index 0000000..86b4bed
--- /dev/null
+++ b/toolkit/Histogram.cpp
@@ -0,0 +1,299 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <vector>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.Histogram"
+
+namespace android {
+namespace renderscript {
+
+class HistogramTask : public Task {
+ const uchar* mIn;
+ std::vector<int> mSums;
+ uint32_t mThreadCount;
+
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ void kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+ void kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+ void kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+ void kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+
+ public:
+ HistogramTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
+ uint32_t threadCount, const Restriction* restriction);
+ void collateSums(int* out);
+};
+
+class HistogramDotTask : public Task {
+ const uchar* mIn;
+ float mDot[4];
+ int mDotI[4];
+ std::vector<int> mSums;
+ uint32_t mThreadCount;
+
+ void kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+ void kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+ void kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+ void kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+
+ public:
+ HistogramDotTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
+ uint32_t threadCount, const float* coefficients,
+ const Restriction* restriction);
+ void collateSums(int* out);
+
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+};
+
+HistogramTask::HistogramTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
+ uint32_t threadCount, const Restriction* restriction)
+ : Task{sizeX, sizeY, vectorSize, true, restriction},
+ mIn{in},
+ mSums(256 * paddedSize(vectorSize) * threadCount) {
+ mThreadCount = threadCount;
+}
+
+void HistogramTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ typedef void (HistogramTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
+
+ KernelFunction kernel;
+ switch (mVectorSize) {
+ case 4:
+ kernel = &HistogramTask::kernelP1U4;
+ break;
+ case 3:
+ kernel = &HistogramTask::kernelP1U3;
+ break;
+ case 2:
+ kernel = &HistogramTask::kernelP1U2;
+ break;
+ case 1:
+ kernel = &HistogramTask::kernelP1U1;
+ break;
+ default:
+ ALOGE("Bad vector size %zd", mVectorSize);
+ return;
+ }
+
+ int* sums = &mSums[256 * paddedSize(mVectorSize) * threadIndex];
+
+ for (size_t y = startY; y < endY; y++) {
+ const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize);
+ std::invoke(kernel, this, inPtr, sums, startX, endX);
+ }
+}
+
+void HistogramTask::kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
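+ // The bins are interleaved by channel: sums[value * 4 + channel].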
+ for (uint32_t x = xstart; x < xend; x++) {
+ sums[(in[0] << 2)]++;
+ sums[(in[1] << 2) + 1]++;
+ sums[(in[2] << 2) + 2]++;
+ sums[(in[3] << 2) + 3]++;
+ in += 4;
+ }
+}
+
+void HistogramTask::kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+ for (uint32_t x = xstart; x < xend; x++) {
+ sums[(in[0] << 2)]++;
+ sums[(in[1] << 2) + 1]++;
+ sums[(in[2] << 2) + 2]++;
+ in += 4;
+ }
+}
+
+void HistogramTask::kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+ for (uint32_t x = xstart; x < xend; x++) {
+ sums[(in[0] << 1)]++;
+ sums[(in[1] << 1) + 1]++;
+ in += 2;
+ }
+}
+
+void HistogramTask::kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+ for (uint32_t x = xstart; x < xend; x++) {
+ sums[in[0]]++;
+ in++;
+ }
+}
+
+void HistogramTask::collateSums(int* out) {
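+ // Merge the per-thread partial histograms into the final output.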
+ for (uint32_t ct = 0; ct < (256 * paddedSize(mVectorSize)); ct++) {
+ out[ct] = mSums[ct];
+ for (uint32_t t = 1; t < mThreadCount; t++) {
+ out[ct] += mSums[ct + (256 * paddedSize(mVectorSize) * t)];
+ }
+ }
+}
+
+HistogramDotTask::HistogramDotTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
+ uint32_t threadCount, const float* coefficients,
+ const Restriction* restriction)
+ : Task{sizeX, sizeY, vectorSize, true, restriction}, mIn{in}, mSums(256 * threadCount, 0) {
+ mThreadCount = threadCount;
+
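+ // Default to the BT.601 luma weights when no coefficients are supplied.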
+ if (coefficients == nullptr) {
+ mDot[0] = 0.299f;
+ mDot[1] = 0.587f;
+ mDot[2] = 0.114f;
+ mDot[3] = 0;
+ } else {
+ memcpy(mDot, coefficients, 16);
+ }
+ mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
+ mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
+ mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
+ mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
+}
+
+void HistogramDotTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ typedef void (HistogramDotTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
+
+ KernelFunction kernel;
+ switch (mVectorSize) {
+ case 4:
+ kernel = &HistogramDotTask::kernelP1L4;
+ break;
+ case 3:
+ kernel = &HistogramDotTask::kernelP1L3;
+ break;
+ case 2:
+ kernel = &HistogramDotTask::kernelP1L2;
+ break;
+ case 1:
+ kernel = &HistogramDotTask::kernelP1L1;
+ break;
+ default:
+ ALOGI("Bad vector size %zd", mVectorSize);
+ return;
+ }
+
+ int* sums = &mSums[256 * threadIndex];
+
+ for (size_t y = startY; y < endY; y++) {
+ const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize);
+ std::invoke(kernel, this, inPtr, sums, startX, endX);
+ }
+}
+
+void HistogramDotTask::kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
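+ // mDotI holds the coefficients in Q8.8 fixed point, so t is the weighted sum scaled by
+ // 256; adding 0x7f before the >> 8 converts it back, with rounding, to a 0..255 bin index.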
+ for (uint32_t x = xstart; x < xend; x++) {
+ int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]) + (mDotI[3] * in[3]);
+ sums[(t + 0x7f) >> 8]++;
+ in += 4;
+ }
+}
+
+void HistogramDotTask::kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+ for (uint32_t x = xstart; x < xend; x++) {
+ int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]);
+ sums[(t + 0x7f) >> 8]++;
+ in += 4;
+ }
+}
+
+void HistogramDotTask::kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+ for (uint32_t x = xstart; x < xend; x++) {
+ int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]);
+ sums[(t + 0x7f) >> 8]++;
+ in += 2;
+ }
+}
+
+void HistogramDotTask::kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+ for (uint32_t x = xstart; x < xend; x++) {
+ int t = (mDotI[0] * in[0]);
+ sums[(t + 0x7f) >> 8]++;
+ in++;
+ }
+}
+
+void HistogramDotTask::collateSums(int* out) {
+ for (uint32_t ct = 0; ct < 256; ct++) {
+ out[ct] = mSums[ct];
+ for (uint32_t t = 1; t < mThreadCount; t++) {
+ out[ct] += mSums[ct + (256 * t)];
+ }
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////
+
+void RenderScriptToolkit::histogram(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
+ size_t vectorSize, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+ return;
+ }
+ if (vectorSize < 1 || vectorSize > 4) {
+ ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+ return;
+ }
+#endif
+
+ HistogramTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), restriction);
+ processor->doTask(&task);
+ task.collateSums(out);
+}
+
+void RenderScriptToolkit::histogramDot(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
+ size_t vectorSize, const float* coefficients,
+ const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+ return;
+ }
+ if (vectorSize < 1 || vectorSize > 4) {
+ ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+ return;
+ }
+ if (coefficients != nullptr) {
+ float sum = 0.0f;
+ for (size_t i = 0; i < vectorSize; i++) {
+ if (coefficients[i] < 0.0f) {
+ ALOGE("histogramDot coefficients should not be negative. Coefficient %zu was %f.",
+ i, coefficients[i]);
+ return;
+ }
+ sum += coefficients[i];
+ }
+ if (sum > 1.0f) {
+ ALOGE("histogramDot coefficients should add to 1 or less. Their sum is %f.", sum);
+ return;
+ }
+ }
+#endif
+
+ HistogramDotTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(),
+ coefficients, restriction);
+ processor->doTask(&task);
+ task.collateSums(out);
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/JniEntryPoints.cpp b/toolkit/JniEntryPoints.cpp
new file mode 100644
index 0000000..3bf5911
--- /dev/null
+++ b/toolkit/JniEntryPoints.cpp
@@ -0,0 +1,480 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <android/bitmap.h>
+#include <assert.h>
+#include <jni.h>
+
+#include "RenderScriptToolkit.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.JniEntryPoints"
+
+using namespace android::renderscript;
+
+/**
+ * I compared using env->GetPrimitiveArrayCritical vs. env->GetByteArrayElements to get access
+ * to the underlying data. On Pixel 4, it's actually faster to not use critical. The code is left
+ * here if you want to experiment. Note that USE_CRITICAL could block the garbage collector.
+ */
+// #define USE_CRITICAL
+
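+// RAII wrappers that pin a Java primitive array for the duration of a native call and
+// release it when the guard goes out of scope.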
+class ByteArrayGuard {
+ private:
+ JNIEnv* env;
+ jbyteArray array;
+ jbyte* data;
+
+ public:
+ ByteArrayGuard(JNIEnv* env, jbyteArray array) : env{env}, array{array} {
+#ifdef USE_CRITICAL
+ data = reinterpret_cast<jbyte*>(env->GetPrimitiveArrayCritical(array, nullptr));
+#else
+ data = env->GetByteArrayElements(array, nullptr);
+#endif
+ }
+ ~ByteArrayGuard() {
+#ifdef USE_CRITICAL
+ env->ReleasePrimitiveArrayCritical(array, data, 0);
+#else
+ env->ReleaseByteArrayElements(array, data, 0);
+#endif
+ }
+ uint8_t* get() { return reinterpret_cast<uint8_t*>(data); }
+};
+
+class IntArrayGuard {
+ private:
+ JNIEnv* env;
+ jintArray array;
+ jint* data;
+
+ public:
+ IntArrayGuard(JNIEnv* env, jintArray array) : env{env}, array{array} {
+#ifdef USE_CRITICAL
+ data = reinterpret_cast<jint*>(env->GetPrimitiveArrayCritical(array, nullptr));
+#else
+ data = env->GetIntArrayElements(array, nullptr);
+#endif
+ }
+ ~IntArrayGuard() {
+#ifdef USE_CRITICAL
+ env->ReleasePrimitiveArrayCritical(array, data, 0);
+#else
+ env->ReleaseIntArrayElements(array, data, 0);
+#endif
+ }
+ int* get() { return reinterpret_cast<int*>(data); }
+};
+
+class FloatArrayGuard {
+ private:
+ JNIEnv* env;
+ jfloatArray array;
+ jfloat* data;
+
+ public:
+ FloatArrayGuard(JNIEnv* env, jfloatArray array) : env{env}, array{array} {
+#ifdef USE_CRITICAL
+ data = reinterpret_cast<jfloat*>(env->GetPrimitiveArrayCritical(array, nullptr));
+#else
+ data = env->GetFloatArrayElements(array, nullptr);
+#endif
+ }
+ ~FloatArrayGuard() {
+#ifdef USE_CRITICAL
+ env->ReleasePrimitiveArrayCritical(array, data, 0);
+#else
+ env->ReleaseFloatArrayElements(array, data, 0);
+#endif
+ }
+ float* get() { return reinterpret_cast<float*>(data); }
+};
+
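+/**
+ * RAII wrapper around an Android Bitmap: validates the format, locks the pixels on
+ * construction, and unlocks them on destruction.
+ */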
+class BitmapGuard {
+ private:
+ JNIEnv* env;
+ jobject bitmap;
+ AndroidBitmapInfo info;
+ int bytesPerPixel;
+ void* bytes;
+ bool valid;
+
+ public:
+ BitmapGuard(JNIEnv* env, jobject jBitmap) : env{env}, bitmap{jBitmap}, bytes{nullptr} {
+ valid = false;
+ if (AndroidBitmap_getInfo(env, bitmap, &info) != ANDROID_BITMAP_RESULT_SUCCESS) {
+ ALOGE("AndroidBitmap_getInfo failed");
+ return;
+ }
+ if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888 &&
+ info.format != ANDROID_BITMAP_FORMAT_A_8) {
+ ALOGE("AndroidBitmap in the wrong format");
+ return;
+ }
+ bytesPerPixel = info.stride / info.width;
+ if (bytesPerPixel != 1 && bytesPerPixel != 4) {
+ ALOGE("Expected a vector size of 1 or 4. Got %d. Extra padding per line not currently "
+ "supported",
+ bytesPerPixel);
+ return;
+ }
+ if (AndroidBitmap_lockPixels(env, bitmap, &bytes) != ANDROID_BITMAP_RESULT_SUCCESS) {
+ ALOGE("AndroidBitmap_lockPixels failed");
+ return;
+ }
+ valid = true;
+ }
+ ~BitmapGuard() {
+ if (valid) {
+ AndroidBitmap_unlockPixels(env, bitmap);
+ }
+ }
+ uint8_t* get() const {
+ assert(valid);
+ return reinterpret_cast<uint8_t*>(bytes);
+ }
+ int width() const { return info.width; }
+ int height() const { return info.height; }
+ int vectorSize() const { return bytesPerPixel; }
+};
+
+/**
+ * Copies the content of Kotlin Range2d object into the equivalent C++ struct.
+ */
+class RestrictionParameter {
+ private:
+ bool isNull;
+ Restriction restriction;
+
+ public:
+ RestrictionParameter(JNIEnv* env, jobject jRestriction) : isNull{jRestriction == nullptr} {
+ if (isNull) {
+ return;
+ }
+ /* TODO Measure how long FindClass and related functions take. Consider passing the
+ * four values instead. This would also require setting the default when Range2D is null.
+ */
+ jclass restrictionClass = env->FindClass("android/renderscript/toolkit/Range2d");
+ if (restrictionClass == nullptr) {
+ ALOGE("RenderScriptToolit. Internal error. Could not find the Kotlin Range2d class.");
+ isNull = true;
+ return;
+ }
+ jfieldID startXId = env->GetFieldID(restrictionClass, "startX", "I");
+ jfieldID startYId = env->GetFieldID(restrictionClass, "startY", "I");
+ jfieldID endXId = env->GetFieldID(restrictionClass, "endX", "I");
+ jfieldID endYId = env->GetFieldID(restrictionClass, "endY", "I");
+ restriction.startX = env->GetIntField(jRestriction, startXId);
+ restriction.startY = env->GetIntField(jRestriction, startYId);
+ restriction.endX = env->GetIntField(jRestriction, endXId);
+ restriction.endY = env->GetIntField(jRestriction, endYId);
+ }
+ Restriction* get() { return isNull ? nullptr : &restriction; }
+};
+
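+/* The Kotlin/Java Toolkit class keeps the value returned by createNative() and passes it back
+ * as native_handle on every call; it is simply the RenderScriptToolkit pointer cast to a jlong.
+ */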
+extern "C" JNIEXPORT jlong JNICALL
+Java_android_renderscript_toolkit_Toolkit_createNative(JNIEnv* /*env*/, jobject /*thiz*/) {
+ return reinterpret_cast<jlong>(new RenderScriptToolkit());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_destroyNative(
+ JNIEnv* /*env*/, jobject /*thiz*/, jlong native_handle) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ delete toolkit;
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlend(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jint jmode, jbyteArray source_array,
+ jbyteArray dest_array, jint size_x, jint size_y, jobject restriction) {
+ auto toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ auto mode = static_cast<RenderScriptToolkit::BlendingMode>(jmode);
+ RestrictionParameter restrict {env, restriction};
+ ByteArrayGuard source{env, source_array};
+ ByteArrayGuard dest{env, dest_array};
+
+ toolkit->blend(mode, source.get(), dest.get(), size_x, size_y, restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlendBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jint jmode, jobject source_bitmap,
+ jobject dest_bitmap, jobject restriction) {
+ auto toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ auto mode = static_cast<RenderScriptToolkit::BlendingMode>(jmode);
+ RestrictionParameter restrict {env, restriction};
+ BitmapGuard source{env, source_bitmap};
+ BitmapGuard dest{env, dest_bitmap};
+
+ toolkit->blend(mode, source.get(), dest.get(), source.width(), source.height(), restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlur(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint vectorSize,
+ jint size_x, jint size_y, jint radius, jbyteArray output_array, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ ByteArrayGuard input{env, input_array};
+ ByteArrayGuard output{env, output_array};
+
+ toolkit->blur(input.get(), output.get(), size_x, size_y, vectorSize, radius, restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlurBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+ jobject output_bitmap, jint radius, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ BitmapGuard input{env, input_bitmap};
+ BitmapGuard output{env, output_bitmap};
+
+ toolkit->blur(input.get(), output.get(), input.width(), input.height(), input.vectorSize(),
+ radius, restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeColorMatrix(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+ jint input_vector_size, jint size_x, jint size_y, jbyteArray output_array,
+ jint output_vector_size, jfloatArray jmatrix, jfloatArray add_vector, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ ByteArrayGuard input{env, input_array};
+ ByteArrayGuard output{env, output_array};
+ FloatArrayGuard matrix{env, jmatrix};
+ FloatArrayGuard add{env, add_vector};
+
+ toolkit->colorMatrix(input.get(), output.get(), input_vector_size, output_vector_size, size_x,
+ size_y, matrix.get(), add.get(), restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeColorMatrixBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+ jobject output_bitmap, jfloatArray jmatrix, jfloatArray add_vector, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ BitmapGuard input{env, input_bitmap};
+ BitmapGuard output{env, output_bitmap};
+ FloatArrayGuard matrix{env, jmatrix};
+ FloatArrayGuard add{env, add_vector};
+
+ toolkit->colorMatrix(input.get(), output.get(), input.vectorSize(), output.vectorSize(),
+ input.width(), input.height(), matrix.get(), add.get(), restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeConvolve(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint vectorSize,
+ jint size_x, jint size_y, jbyteArray output_array, jfloatArray coefficients,
+ jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ ByteArrayGuard input{env, input_array};
+ ByteArrayGuard output{env, output_array};
+ FloatArrayGuard coeffs{env, coefficients};
+
+ switch (env->GetArrayLength(coefficients)) {
+ case 9:
+ toolkit->convolve3x3(input.get(), output.get(), vectorSize, size_x, size_y,
+ coeffs.get(), restrict.get());
+ break;
+ case 25:
+ toolkit->convolve5x5(input.get(), output.get(), vectorSize, size_x, size_y,
+ coeffs.get(), restrict.get());
+ break;
+ }
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeConvolveBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+ jobject output_bitmap, jfloatArray coefficients, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ BitmapGuard input{env, input_bitmap};
+ BitmapGuard output{env, output_bitmap};
+ FloatArrayGuard coeffs{env, coefficients};
+
+ switch (env->GetArrayLength(coefficients)) {
+ case 9:
+ toolkit->convolve3x3(input.get(), output.get(), input.vectorSize(), input.width(),
+ input.height(), coeffs.get(), restrict.get());
+ break;
+ case 25:
+ toolkit->convolve5x5(input.get(), output.get(), input.vectorSize(), input.width(),
+ input.height(), coeffs.get(), restrict.get());
+ break;
+ }
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogram(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+ jint vector_size, jint size_x, jint size_y, jintArray output_array, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ ByteArrayGuard input{env, input_array};
+ IntArrayGuard output{env, output_array};
+
+ toolkit->histogram(input.get(), output.get(), size_x, size_y, vector_size, restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogramBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+ jintArray output_array, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ BitmapGuard input{env, input_bitmap};
+ IntArrayGuard output{env, output_array};
+
+ toolkit->histogram(input.get(), output.get(), input.width(), input.height(), input.vectorSize(),
+ restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogramDot(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+ jint vector_size, jint size_x, jint size_y, jintArray output_array,
+ jfloatArray coefficients, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ ByteArrayGuard input{env, input_array};
+ IntArrayGuard output{env, output_array};
+ FloatArrayGuard coeffs{env, coefficients};
+
+ toolkit->histogramDot(input.get(), output.get(), size_x, size_y, vector_size, coeffs.get(),
+ restrict.get());
+}
+
+extern "C" JNIEXPORT
+void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogramDotBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+ jintArray output_array, jfloatArray coefficients, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ BitmapGuard input{env, input_bitmap};
+ IntArrayGuard output{env, output_array};
+ FloatArrayGuard coeffs{env, coefficients};
+
+ toolkit->histogramDot(input.get(), output.get(), input.width(), input.height(),
+ input.vectorSize(), coeffs.get(), restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLut(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+ jbyteArray output_array, jint size_x, jint size_y, jbyteArray red_table,
+ jbyteArray green_table, jbyteArray blue_table, jbyteArray alpha_table,
+ jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+
+ ByteArrayGuard input{env, input_array};
+ ByteArrayGuard output{env, output_array};
+ ByteArrayGuard red{env, red_table};
+ ByteArrayGuard green{env, green_table};
+ ByteArrayGuard blue{env, blue_table};
+ ByteArrayGuard alpha{env, alpha_table};
+
+ toolkit->lut(input.get(), output.get(), size_x, size_y, red.get(), green.get(), blue.get(),
+ alpha.get(), restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLutBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+ jobject output_bitmap, jbyteArray red_table, jbyteArray green_table, jbyteArray blue_table,
+ jbyteArray alpha_table, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+
+ BitmapGuard input{env, input_bitmap};
+ BitmapGuard output{env, output_bitmap};
+ ByteArrayGuard red{env, red_table};
+ ByteArrayGuard green{env, green_table};
+ ByteArrayGuard blue{env, blue_table};
+ ByteArrayGuard alpha{env, alpha_table};
+
+ toolkit->lut(input.get(), output.get(), input.width(), input.height(), red.get(), green.get(),
+ blue.get(), alpha.get(), restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLut3d(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+ jbyteArray output_array, jint size_x, jint size_y, jbyteArray cube_values, jint cubeSizeX,
+ jint cubeSizeY, jint cubeSizeZ, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ ByteArrayGuard input{env, input_array};
+ ByteArrayGuard output{env, output_array};
+ ByteArrayGuard cube{env, cube_values};
+
+ toolkit->lut3d(input.get(), output.get(), size_x, size_y, cube.get(), cubeSizeX, cubeSizeY,
+ cubeSizeZ, restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLut3dBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+ jobject output_bitmap, jbyteArray cube_values, jint cubeSizeX, jint cubeSizeY,
+ jint cubeSizeZ, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ BitmapGuard input{env, input_bitmap};
+ BitmapGuard output{env, output_bitmap};
+ ByteArrayGuard cube{env, cube_values};
+
+ toolkit->lut3d(input.get(), output.get(), input.width(), input.height(), cube.get(), cubeSizeX,
+ cubeSizeY, cubeSizeZ, restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeResize(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+ jint vector_size, jint input_size_x, jint input_size_y, jbyteArray output_array,
+ jint output_size_x, jint output_size_y, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ ByteArrayGuard input{env, input_array};
+ ByteArrayGuard output{env, output_array};
+
+ toolkit->resize(input.get(), output.get(), input_size_x, input_size_y, vector_size,
+ output_size_x, output_size_y, restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeResizeBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+ jobject output_bitmap, jobject restriction) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ RestrictionParameter restrict {env, restriction};
+ BitmapGuard input{env, input_bitmap};
+ BitmapGuard output{env, output_bitmap};
+
+ toolkit->resize(input.get(), output.get(), input.width(), input.height(), input.vectorSize(),
+ output.width(), output.height(), restrict.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeYuvToRgb(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+ jbyteArray output_array, jint size_x, jint size_y, jint format) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ ByteArrayGuard input{env, input_array};
+ ByteArrayGuard output{env, output_array};
+
+ toolkit->yuvToRgb(input.get(), output.get(), size_x, size_y,
+ static_cast<RenderScriptToolkit::YuvFormat>(format));
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeYuvToRgbBitmap(
+ JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint size_x,
+ jint size_y, jobject output_bitmap, jint format) {
+ RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+ BitmapGuard output{env, output_bitmap};
+ ByteArrayGuard input{env, input_array};
+
+ toolkit->yuvToRgb(input.get(), output.get(), size_x, size_y,
+ static_cast<RenderScriptToolkit::YuvFormat>(format));
+}
diff --git a/toolkit/Lut.cpp b/toolkit/Lut.cpp
new file mode 100644
index 0000000..4ac5cdc
--- /dev/null
+++ b/toolkit/Lut.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.Lut"
+
+namespace android {
+namespace renderscript {
+
+class LutTask : public Task {
+ const uchar4* mIn;
+ uchar4* mOut;
+ const uchar* mRedTable;
+ const uchar* mGreenTable;
+ const uchar* mBlueTable;
+ const uchar* mAlphaTable;
+
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ public:
+ LutTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY, const uint8_t* red,
+ const uint8_t* green, const uint8_t* blue, const uint8_t* alpha,
+ const Restriction* restriction)
+ : Task{sizeX, sizeY, 4, true, restriction},
+ mIn{reinterpret_cast<const uchar4*>(input)},
+ mOut{reinterpret_cast<uchar4*>(output)},
+ mRedTable{red},
+ mGreenTable{green},
+ mBlueTable{blue},
+ mAlphaTable{alpha} {}
+};
+
+void LutTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ for (size_t y = startY; y < endY; y++) {
+ size_t offset = mSizeX * y + startX;
+ const uchar4* in = mIn + offset;
+ uchar4* out = mOut + offset;
+ for (size_t x = startX; x < endX; x++) {
+ auto v = *in;
+ *out = uchar4{mRedTable[v.x], mGreenTable[v.y], mBlueTable[v.z], mAlphaTable[v.w]};
+ in++;
+ out++;
+ }
+ }
+}
+
+void RenderScriptToolkit::lut(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
+ const uint8_t* red, const uint8_t* green, const uint8_t* blue,
+ const uint8_t* alpha, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+ return;
+ }
+#endif
+
+ LutTask task(input, output, sizeX, sizeY, red, green, blue, alpha, restriction);
+ processor->doTask(&task);
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/Lut3d.cpp b/toolkit/Lut3d.cpp
new file mode 100644
index 0000000..f8a7d61
--- /dev/null
+++ b/toolkit/Lut3d.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Lut3d"
+
+/**
+ * Converts a RGBA buffer using a 3D cube.
+ */
+class Lut3dTask : public Task {
+ // The input array we're transforming.
+ const uchar4* mIn;
+ // Where we'll store the transformed result.
+ uchar4* mOut;
+ // The size of each of the three cube dimensions. We don't make use of the last value.
+ int4 mCubeDimension;
+ // The translation cube, in row major format.
+ const uchar* mCubeTable;
+
+ /**
+ * Converts a subset of a line of the 2D buffer.
+ *
+ * @param in The start of the data to transform.
+ * @param out Where to store the result.
+ * @param length The number of 4-byte vectors to transform.
+ */
+ void kernel(const uchar4* in, uchar4* out, uint32_t length);
+
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ public:
+ Lut3dTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
+ const uint8_t* cube, int cubeSizeX, int cubeSizeY, int cubeSizeZ,
+ const Restriction* restriction)
+ : Task{sizeX, sizeY, 4, true, restriction},
+ mIn{reinterpret_cast<const uchar4*>(input)},
+ mOut{reinterpret_cast<uchar4*>(output)},
+ mCubeDimension{cubeSizeX, cubeSizeY, cubeSizeZ, 0},
+ mCubeTable{cube} {}
+};
+
+extern "C" void rsdIntrinsic3DLUT_K(void* dst, void const* in, size_t count, void const* lut,
+ int32_t pitchy, int32_t pitchz, int dimx, int dimy, int dimz);
+
+void Lut3dTask::kernel(const uchar4* in, uchar4* out, uint32_t length) {
+ uint32_t x1 = 0;
+ uint32_t x2 = length;
+
+ const uchar* bp = mCubeTable;
+
+ int4 dims = mCubeDimension - 1;
+
+ const float4 m = (float4)(1.f / 255.f) * convert<float4>(dims);
+ const int4 coordMul = convert<int4>(m * (float4)0x8000);
+ const size_t stride_y = mCubeDimension.x * 4;
+ const size_t stride_z = stride_y * mCubeDimension.y;
+
+ // ALOGE("strides %zu %zu", stride_y, stride_z);
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+ if (mUsesSimd) {
+ int32_t len = x2 - x1;
+ if (len > 0) {
+ rsdIntrinsic3DLUT_K(out, in, len, bp, stride_y, stride_z, dims.x, dims.y, dims.z);
+ x1 += len;
+ out += len;
+ in += len;
+ }
+ }
+#endif
+
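+ // Scalar fallback: trilinear interpolation within the cube cell enclosing each input color,
+ // using 15-bit fixed-point weights (weight1 + weight2 == 0x8000 per axis).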
+ while (x1 < x2) {
+ int4 baseCoord = convert<int4>(*in) * coordMul;
+ int4 coord1 = baseCoord >> (int4)15;
+ // int4 coord2 = min(coord1 + 1, gDims - 1);
+
+ int4 weight2 = baseCoord & 0x7fff;
+ int4 weight1 = (int4)0x8000 - weight2;
+
+ // ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
+ const uchar* bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z);
+ const uchar4* pt_00 = (const uchar4*)&bp2[0];
+ const uchar4* pt_10 = (const uchar4*)&bp2[stride_y];
+ const uchar4* pt_01 = (const uchar4*)&bp2[stride_z];
+ const uchar4* pt_11 = (const uchar4*)&bp2[stride_y + stride_z];
+
+ uint4 v000 = convert<uint4>(pt_00[0]);
+ uint4 v100 = convert<uint4>(pt_00[1]);
+ uint4 v010 = convert<uint4>(pt_10[0]);
+ uint4 v110 = convert<uint4>(pt_10[1]);
+ uint4 v001 = convert<uint4>(pt_01[0]);
+ uint4 v101 = convert<uint4>(pt_01[1]);
+ uint4 v011 = convert<uint4>(pt_11[0]);
+ uint4 v111 = convert<uint4>(pt_11[1]);
+
+ uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7;
+ uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7;
+ uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7;
+ uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7;
+
+ uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15;
+ uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15;
+
+ uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15;
+ uint4 v2 = (v + 0x7f) >> (int4)8;
+
+ uchar4 ret = convert<uchar4>(v2);
+ ret.w = in->w;
+
+#if 0
+ if (!x1) {
+ ALOGE("in %08x %08x %08x %08x", in->r, in->g, in->b, in->a);
+ ALOGE("baseCoord %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z,
+ baseCoord.w);
+ ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
+ ALOGE("weight1 %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w);
+ ALOGE("weight2 %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w);
+
+ ALOGE("v000 %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w);
+ ALOGE("v100 %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w);
+ ALOGE("yz00 %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w);
+ ALOGE("z0 %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w);
+
+ ALOGE("v %08x %08x %08x %08x", v.x, v.y, v.z, v.w);
+ ALOGE("v2 %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w);
+ }
+#endif
+ *out = ret;
+
+ in++;
+ out++;
+ x1++;
+ }
+}
+
+void Lut3dTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ for (size_t y = startY; y < endY; y++) {
+ size_t offset = mSizeX * y + startX;
+ kernel(mIn + offset, mOut + offset, endX - startX);
+ }
+}
+
+void RenderScriptToolkit::lut3d(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
+ const uint8_t* cube, size_t cubeSizeX, size_t cubeSizeY,
+ size_t cubeSizeZ, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+ return;
+ }
+#endif
+
+ Lut3dTask task(input, output, sizeX, sizeY, cube, cubeSizeX, cubeSizeY, cubeSizeZ, restriction);
+ processor->doTask(&task);
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/Lut3d_advsimd.S b/toolkit/Lut3d_advsimd.S
new file mode 100644
index 0000000..edcb038
--- /dev/null
+++ b/toolkit/Lut3d_advsimd.S
@@ -0,0 +1,250 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
+.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1
+
+ smov x6, \src0
+ smov x7, \src1
+
+ add x6, x6, x3
+ add x7, x7, x3
+
+ ld1 {v16.2s}, [x6], x4
+ ld1 {v17.2s}, [x7], x4
+
+ ld1 {v18.2s}, [x6], x5
+ ld1 {v19.2s}, [x7], x5
+
+ dup v8.8b, \yr0
+ dup v9.8b, \yr1
+ /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
+ zip1 v12.16b, v5.16b, v16.16b
+ zip1 v13.16b, v5.16b, v17.16b
+ umlsl v12.8h, v16.8b, v8.8b
+ umlsl v13.8h, v17.8b, v9.8b
+ umlal v12.8h, v18.8b, v8.8b
+ umlal v13.8h, v19.8b, v9.8b
+
+ ld1 {v18.2s}, [x6]
+ ld1 {v19.2s}, [x7]
+
+ sub x6, x6, x4
+ sub x7, x7, x4
+
+ ld1 {v16.2s}, [x6]
+ ld1 {v17.2s}, [x7]
+
+ /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
+ zip1 v14.16b, v5.16b, v16.16b
+ zip1 v15.16b, v5.16b, v17.16b
+ umlsl v14.8h, v16.8b, v8.8b
+ umlsl v15.8h, v17.8b, v9.8b
+ umlal v14.8h, v18.8b, v8.8b
+ umlal v15.8h, v19.8b, v9.8b
+
+ /* Z interpolate, lane 0 v12/v14 -> v10 */
+ ushll v8.4s, v12.4h, #8
+ ushll2 v9.4s, v12.8h, #8
+ umlsl v8.4s, v12.4h, \zr0
+ umlsl2 v9.4s, v12.8h, \zr0
+ umlal v8.4s, v14.4h, \zr0
+ umlal2 v9.4s, v14.8h, \zr0
+ rshrn v10.4h, v8.4s, #8
+ rshrn2 v10.8h, v9.4s, #8
+
+ /* Z interpolate, lane 1 v13/v15 -> v11 */
+ ushll v8.4s, v13.4h, #8
+ ushll2 v9.4s, v13.8h, #8
+ umlsl v8.4s, v13.4h, \zr1
+ umlsl2 v9.4s, v13.8h, \zr1
+ umlal v8.4s, v15.4h, \zr1
+ umlal2 v9.4s, v15.8h, \zr1
+ rshrn v11.4h, v8.4s, #8
+ rshrn2 v11.8h, v9.4s, #8
+
+ /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
+ ushll v8.4s, v10.4h, #8
+ ushll v9.4s, v11.4h, #8
+ umlsl v8.4s, v10.4h, \xr0
+ umlsl v9.4s, v11.4h, \xr1
+ umlal2 v8.4s, v10.8h, \xr0
+ umlal2 v9.4s, v11.8h, \xr1
+ shrn v14.4h, v8.4s, #8
+ shrn2 v14.8h, v9.4s, #8
+
+ /* pack lanes 0-1 -> v6 */
+.ifc \dst, v20.16b
+ uqrshrn2 \dst, v14.8h, #8
+.else ; .ifc \dst, v21.16b
+ uqrshrn2 \dst, v14.8h, #8
+.else
+ uqrshrn \dst, v14.8h, #8
+.endif ; .endif
+.endm
+
+/* void rsdIntrinsic3DLUT_K(
+ * void *dst, // x0
+ * void const *in, // x1
+ * size_t count, // x2
+ * void const *lut, // x3
+ * int32_t pitchy, // w4
+ * int32_t pitchz, // w5
+ * int dimx, // w6
+ * int dimy, // w7
+ * int dimz); // [sp]
+ */
+ENTRY(rsdIntrinsic3DLUT_K)
+ ldr w8, [sp]
+ stp d8, d9, [sp, #-64]!
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+ movi v4.8b, #1
+ ins v4.h[0], w6
+ ins v4.h[1], w7
+ ins v4.h[2], w8
+ ins v4.s[2], w4
+ ins v4.s[3], w5
+ movi v5.16b, #0
+
+ subs x2, x2, #8
+ bge 2f
+ cmn x2, #8 // same as cmp x2, #-8
+ ble 9f
+ b 4f
+
+ .align 6
+1: st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
+/* x0 = dst
+ * x1 = src
+ * x2 = count
+ * x3 = lut
+ * x4 = pitchy
+ * x5 = pitchz
+ * x6 = offset0
+ * x7 = offset1
+ */
+2: ld4 {v0.8b-v3.8b}, [x1], #32
+/* v0,v1,v2,v3 source data
+ * v4 dimensions and pitches
+ */
+3: uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ mul v0.8h, v0.8h, v4.h[0]
+ mul v1.8h, v1.8h, v4.h[1]
+ mul v2.8h, v2.8h, v4.h[2]
+
+/* ursra below would be more accurate, but this can result in a dim.0 case
+ * where we try to read from the limit of the array and the limit +1 to
+ * interpolate, even though the fractional component is zero. Strictly this is
+ * correct, except for the llegal access problem.
+ */
+ usra v0.8h, v0.8h, #8
+ usra v1.8h, v1.8h, #8
+ usra v2.8h, v2.8h, #8
+
+ ushr v12.8h, v0.8h, #8
+ ushr v13.8h, v1.8h, #8
+ ushr v14.8h, v2.8h, #8
+ bic v0.8h, #0xff, LSL #8
+ xtn v1.8b, v1.8h
+ bic v2.8h, #0xff, LSL #8
+
+/* v0.8h,v1.8b,v2.hb fractional offset
+ * v12.8h,v13.8h,v14.8h integer offset
+ */
+
+ ushll v6.4s, v12.4h, #2
+ ushll2 v7.4s, v12.8h, #2
+ uxtl v8.4s, v13.4h
+ uxtl2 v9.4s, v13.8h
+ uxtl v10.4s, v14.4h
+ uxtl2 v11.4s, v14.8h
+ mla v6.4s, v8.4s, v4.s[2]
+ mla v7.4s, v9.4s, v4.s[2]
+ mla v6.4s, v10.4s, v4.s[3]
+ mla v7.4s, v11.4s, v4.s[3]
+
+/* v6,v7 list of table offsets */
+
+ /* lanes 0 and 1 */
+ lanepair dst=v20.8b, src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]
+
+ /* lanes 2 and 3 */
+ lanepair dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]
+
+ /* lanes 4 and 5 */
+ lanepair dst=v21.8b, src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]
+
+ /* lanes 6 and 7 */
+ lanepair dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]
+
+ uzp1 v6.16b, v20.16b, v21.16b
+ uzp2 v7.16b, v20.16b, v21.16b
+ uzp1 v20.16b, v6.16b, v7.16b
+ uzp2 v22.16b, v6.16b, v7.16b
+ mov v21.d[0], v20.d[1]
+
+ subs x2, x2, #8
+ mov v23.8b, v3.8b
+
+ bge 1b
+
+ cmn x2, #8 // same as cmp x2, #-8
+ blt 1f
+
+ st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
+ beq 9f
+
+ /* fill the vector with a safe value */
+4: ld4r {v0.8b-v3.8b}, [x1]
+ tbz x2, #2, 2f
+ ld4 {v0.b-v3.b}[0], [x1], #4
+ ld4 {v0.b-v3.b}[1], [x1], #4
+ ld4 {v0.b-v3.b}[2], [x1], #4
+ ld4 {v0.b-v3.b}[3], [x1], #4
+2: tbz x2, #1, 2f
+ ld4 {v0.b-v3.b}[4], [x1], #4
+ ld4 {v0.b-v3.b}[5], [x1], #4
+2: tbz x2, #0, 2f
+ ld4 {v0.b-v3.b}[6], [x1], #4
+2: b 3b
+
+1: tst x2, #4
+ beq 2f
+ st4 {v20.b-v23.b}[0], [x0], #4
+ st4 {v20.b-v23.b}[1], [x0], #4
+ st4 {v20.b-v23.b}[2], [x0], #4
+ st4 {v20.b-v23.b}[3], [x0], #4
+2: tst x2, #2
+ beq 2f
+ st4 {v20.b-v23.b}[4], [x0], #4
+ st4 {v20.b-v23.b}[5], [x0], #4
+2: tst x2, #1
+ beq 9f
+ st4 {v20.b-v23.b}[6], [x0], #4
+
+9: ldp d14, d15, [sp, #48]
+ ldp d12, d13, [sp, #32]
+ ldp d10, d11, [sp, #16]
+ ldp d8, d9, [sp], #64
+ ret
+END(rsdIntrinsic3DLUT_K)
diff --git a/toolkit/Lut3d_neon.S b/toolkit/Lut3d_neon.S
new file mode 100644
index 0000000..9590f9c
--- /dev/null
+++ b/toolkit/Lut3d_neon.S
@@ -0,0 +1,256 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1
+
+ vmov r6, r7, \src
+
+ add r6, r6, r3
+ add r7, r7, r3
+
+ vld1.u8 d16, [r6], r4
+ vld1.u8 d17, [r7], r4
+
+ vld1.u8 d18, [r6], r5
+ vld1.u8 d19, [r7], r5
+
+ vdup.u8 d6, \yr0
+ vdup.u8 d7, \yr1
+ /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */
+ vshll.u8 q12, d16, #8
+ vshll.u8 q13, d17, #8
+ vmlsl.u8 q12, d16, d6
+ vmlsl.u8 q13, d17, d7
+ vmlal.u8 q12, d18, d6
+ vmlal.u8 q13, d19, d7
+
+ vld1.u8 d18, [r6]
+ vld1.u8 d19, [r7]
+
+ sub r6, r6, r4
+ sub r7, r7, r4
+
+ vld1.u8 d16, [r6]
+ vld1.u8 d17, [r7]
+
+ /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
+ vshll.u8 q14, d16, #8
+ vshll.u8 q15, d17, #8
+ vmlsl.u8 q14, d16, d6
+ vmlsl.u8 q15, d17, d7
+ vmlal.u8 q14, d18, d6
+ vmlal.u8 q15, d19, d7
+
+ /* Z interpolate, lane 0 q12/q14 -> q10 */
+ vshll.u16 q8, d24, #8
+ vshll.u16 q9, d25, #8
+ vmlsl.u16 q8, d24, \zr0
+ vmlsl.u16 q9, d25, \zr0
+ vmlal.u16 q8, d28, \zr0
+ vmlal.u16 q9, d29, \zr0
+ vrshrn.u32 d20, q8, #8
+ vrshrn.u32 d21, q9, #8
+
+ /* Z interpolate, lane 1 q13/q15 -> q11 */
+ vshll.u16 q8, d26, #8
+ vshll.u16 q9, d27, #8
+ vmlsl.u16 q8, d26, \zr1
+ vmlsl.u16 q9, d27, \zr1
+ vmlal.u16 q8, d30, \zr1
+ vmlal.u16 q9, d31, \zr1
+ vrshrn.u32 d22, q8, #8
+ vrshrn.u32 d23, q9, #8
+
+ /* X interpolate, lanes 0 and 1 q10,q11 -> q14 */
+ vshll.u16 q8, d20, #8
+ vshll.u16 q9, d22, #8
+ vmlsl.u16 q8, d20, \xr0
+ vmlsl.u16 q9, d22, \xr1
+ vmlal.u16 q8, d21, \xr0
+ vmlal.u16 q9, d23, \xr1
+ vshrn.u32 d28, q8, #8
+ vshrn.u32 d29, q9, #8
+
+ /* pack lanes 0-1 -> d12 */
+ vqrshrn.u16 \dst, q14, #8
+.endm
+
+/* void rsdIntrinsic3DLUT_K(
+ * void *dst, // r0
+ * void const *in, // r1
+ * size_t count, // r2
+ * void const *lut, // r3
+ * int32_t pitchy, // [sp]
+ * int32_t pitchz, // [sp+#4]
+ * int dimx, // [sp+#8]
+ * int dimy, // [sp+#12]
+ * int dimz); // [sp+#16]
+ */
+ENTRY(rsdIntrinsic3DLUT_K)
+ push {r4,r5,r6,r7}
+ ldr r4, [sp, #16]
+ ldr r5, [sp, #20]
+ ldr r6, [sp, #24]
+ ldr r7, [sp, #28]
+ ldr r12, [sp, #32]
+ vpush {d8-d15}
+
+ vmov.u8 d8, #1
+ vmov.u16 d8[0], r6
+ vmov.u16 d8[1], r7
+ vmov.u16 d8[2], r12
+ vmov d9, r4, r5
+
+ subs r2, #8
+ bge 2f
+ cmp r2, #-8
+ ble 9f
+ b 4f
+
+ .align 6
+1: vst4.u8 {d12,d13,d14,d15}, [r0]!
+/* r0 = dst
+ * r1 = src
+ * r2 = count
+ * r3 = lut
+ * r4 = pitchy
+ * r5 = pitchz
+ * r6 = offset0
+ * r7 = offset1
+ */
+2: vld4.u8 {d0,d2,d4,d6}, [r1]!
+3: vmov d10, d6
+/* q0,q1,q2,q5 source data
+ * q4 dimensions and pitches
+ * q3, scratch register for scalar access
+ */
+ vmov q3, q4
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vmul.u16 q0, q0, d6[0]
+ vmul.u16 q1, q1, d6[1]
+ vmul.u16 q2, q2, d6[2]
+
+/* vrsra.u16 below would be more accurate, but this can result in a dim.0 case
+ * where we try to read from the limit of the array and the limit +1 to
+ * interpolate, even though the fractional component is zero. Strictly this is
+ * correct, except for the llegal access problem.
+ */
+ vsra.u16 q0, q0, #8
+ vsra.u16 q1, q1, #8
+ vsra.u16 q2, q2, #8
+
+ vshr.u16 q12, q0, #8
+ vshr.u16 q13, q1, #8
+ vshr.u16 q14, q2, #8
+
+ vbic.u16 q0, #0xff00
+ vmovn.u16 d2, q1
+ vbic.u16 q2, #0xff00
+
+/* q0,d2,q2 fractional offset
+ * q12,q13,q14 integer offset
+ */
+
+ vshll.u16 q6, d24, #2
+ vshll.u16 q7, d25, #2
+ vmovl.u16 q8, d26
+ vmovl.u16 q9, d27
+ vmovl.u16 q10, d28
+ vmovl.u16 q11, d29
+ vmla.s32 q6, q8, d9[0]
+ vmla.s32 q7, q9, d9[0]
+ vmla.s32 q6, q10, d9[1]
+ vmla.s32 q7, q11, d9[1]
+
+/* q6,q7 list of table offsets */
+
+ /* lanes 0 and 1 */
+ lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]
+
+ /* lanes 2 and 3 */
+ lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]
+
+ /* lanes 4 and 5 */
+ lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]
+
+ /* lanes 6 and 7 */
+ lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]
+
+ vuzp.u8 d12, d13
+ vuzp.u8 d14, d15
+ vuzp.u8 d12, d14
+ vuzp.u8 d13, d15
+
+ subs r2, r2, #8
+ vmov.u8 d15, d10
+
+ bge 1b
+
+ cmp r2, #-8
+ blt 1f
+
+ vst4.u8 {d12,d13,d14,d15}, [r0]!
+
+ beq 9f
+
+ /* fill the vector with a safe value */
+4: vld1.u32 {d0[]}, [r1]
+ vmov d2, d0
+ vmov d4, d0
+ vmov d6, d0
+ tst r2, #4
+ beq 2f
+ vld1.u32 {d0}, [r1]!
+ vld1.u32 {d2}, [r1]!
+2: tst r2, #2
+ beq 2f
+ vld1.u32 {d4}, [r1]!
+2: tst r2, #1
+ beq 2f
+ vld1.u32 {d6[0]}, [r1]!
+2: vuzp.8 d0, d2
+ vuzp.8 d4, d6
+ vuzp.8 d0, d4
+ vuzp.8 d2, d6
+ b 3b
+
+1: vzip.8 d12, d14
+ vzip.8 d13, d15
+ vzip.8 d12, d13
+ vzip.8 d14, d15
+ tst r2, #4
+ beq 2f
+ vst1.u32 {d12,d13}, [r0]!
+2: tst r2, #2
+ beq 2f
+ vst1.u32 {d14}, [r0]!
+2: tst r2, #1
+ beq 9f
+ vst1.u32 {d15[0]}, [r0]!
+
+9: mov r0, #0
+ vpop {d8-d15}
+ pop {r4,r5,r6,r7}
+ bx lr
+END(rsdIntrinsic3DLUT_K)
diff --git a/toolkit/README.txt b/toolkit/README.txt
new file mode 100644
index 0000000..4e08dc5
--- /dev/null
+++ b/toolkit/README.txt
@@ -0,0 +1,9 @@
+This directory will contain the standalone library meant to replace the RenderScript Intrinsics.
+
+The work in this directory is not complete.
+
+To make the review process manageable, a series of smaller CLs will be reviewed and submitted.
+
+While it is initially built with Soong, the end goal is to move it to GitHub once the work
+has been completed. This directory is a staging area for the reviews.
+
diff --git a/toolkit/RenderScriptToolkit.cpp b/toolkit/RenderScriptToolkit.cpp
new file mode 100644
index 0000000..f110317
--- /dev/null
+++ b/toolkit/RenderScriptToolkit.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "RenderScriptToolkit.h"
+
+#include "TaskProcessor.h"
+
+#define LOG_TAG "renderscript.toolkit.RenderScriptToolkit"
+
+namespace android {
+namespace renderscript {
+
+// You will find the implementation of the various transformations in the correspondingly
+// named source file. E.g. RenderScriptToolkit::blur() is found in Blur.cpp.
+
+RenderScriptToolkit::RenderScriptToolkit(int numberOfThreads)
+ : processor{new TaskProcessor(numberOfThreads)} {}
+
+RenderScriptToolkit::~RenderScriptToolkit() {
+ // By defining the destructor here, we don't need to include TaskProcessor.h
+ // in RenderScriptToolkit.h.
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/RenderScriptToolkit.h b/toolkit/RenderScriptToolkit.h
new file mode 100644
index 0000000..fb33195
--- /dev/null
+++ b/toolkit/RenderScriptToolkit.h
@@ -0,0 +1,540 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H
+#define ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H
+
+#include <cstdint>
+#include <memory>
+
+namespace android {
+namespace renderscript {
+
+class TaskProcessor;
+
+/**
+ * Define a range of data to process.
+ *
+ * This struct is used to restrict a Toolkit operation to a rectangular subset of the input
+ * buffer.
+ *
+ * @property startX The index of the first value to be included on the X axis.
+ * @property endX The index after the last value to be included on the X axis.
+ * @property startY The index of the first value to be included on the Y axis.
+ * @property endY The index after the last value to be included on the Y axis.
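+ *
+ * For example (illustrative only), Restriction{0, 100, 0, 50} limits processing to the
+ * 100x50 pixel region at the top-left corner of the buffer.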
+ */
+struct Restriction {
+ size_t startX;
+ size_t endX;
+ size_t startY;
+ size_t endY;
+};
+
+/**
+ * A collection of high-performance graphic utility functions like blur and blend.
+ *
+ * This toolkit provides ten image manipulation functions: blend, blur, color matrix, convolve,
+ * histogram, histogramDot, lut, lut3d, resize, and YUV to RGB. These functions execute
+ * multithreaded on the CPU.
+ *
+ * These functions work over raw byte arrays. You'll need to specify the width and height of
+ * the data to be processed, as well as the number of bytes per pixel. For most use cases,
+ * this will be 4.
+ *
+ * You should instantiate the Toolkit once and reuse it throughout your application.
+ * On instantiation, the Toolkit creates a thread pool that's used for processing all the functions.
+ * You can limit the number of pool threads used by the Toolkit via the constructor. The pool
+ * threads are destroyed once the Toolkit is destroyed, after any pending work is done.
+ *
+ * This library is thread safe. You can call its methods from different threads. The calls will
+ * execute sequentially.
+ *
+ * A Java/Kotlin Toolkit is available. It calls this library through JNI.
+ *
+ * This toolkit can be used as a replacement for most RenderScript Intrinsic functions. Compared
+ * to RenderScript, it's simpler to use and more than twice as fast on the CPU. However, RenderScript
+ * Intrinsics allow more flexibility in the types of allocations supported. In particular, this
+ * toolkit does not support allocations of floats.
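+ *
+ * A minimal usage sketch (the buffer names and sizes are only assumptions of this example):
+ *     RenderScriptToolkit toolkit;
+ *     std::vector<uint8_t> in(width * height * 4), out(width * height * 4);
+ *     toolkit.blur(in.data(), out.data(), width, height, 4, 5);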
+ */
+class RenderScriptToolkit {
+ /** Each Toolkit method call is converted to a Task. The processor owns the thread pool. It
+ * tiles the tasks and schedules them over the pool threads.
+ */
+ std::unique_ptr<TaskProcessor> processor;
+
+ public:
+ /**
+ * Creates the pool threads that are used for processing the method calls.
+ */
+ RenderScriptToolkit(int numberOfThreads = 0);
+ /**
+ * Destroys the thread pool. This stops any in-progress work; Toolkit methods called from
+ * other threads will return without having completed the work. Because of the undefined
+ * state of the output buffers, an application should avoid destroying the Toolkit while
+ * other threads are executing Toolkit methods.
+ */
+ ~RenderScriptToolkit();
+
+ /**
+ * Determines how a source buffer is blended into a destination buffer.
+ *
+ * See {@link RenderScriptToolkit::blend}.
+ *
+ * blend only works on 4 byte RGBA data. In the descriptions below, ".a" represents
+ * the alpha channel.
+ */
+ enum class BlendingMode {
+ /**
+ * dest = 0
+ *
+ * The destination is cleared, i.e. each pixel is set to (0, 0, 0, 0)
+ */
+ CLEAR = 0,
+ /**
+ * dest = src
+ *
+ * Sets each pixel of the destination to the corresponding one in the source.
+ */
+ SRC = 1,
+ /**
+ * dest = dest
+ *
+ * Leaves the destination untouched. This is a no-op.
+ */
+ DST = 2,
+ /**
+ * dest = src + dest * (1.0 - src.a)
+ */
+ SRC_OVER = 3,
+ /**
+ * dest = dest + src * (1.0 - dest.a)
+ */
+ DST_OVER = 4,
+ /**
+ * dest = src * dest.a
+ */
+ SRC_IN = 5,
+ /**
+ * dest = dest * src.a
+ */
+ DST_IN = 6,
+ /**
+ * dest = src * (1.0 - dest.a)
+ */
+ SRC_OUT = 7,
+ /**
+ * dest = dest * (1.0 - src.a)
+ */
+ DST_OUT = 8,
+ /**
+ * dest.rgb = src.rgb * dest.a + (1.0 - src.a) * dest.rgb, dest.a = dest.a
+ */
+ SRC_ATOP = 9,
+ /**
+ * dest.rgb = dest.rgb * src.a + (1.0 - dest.a) * src.rgb, dest.a = src.a
+ */
+ DST_ATOP = 10,
+ /**
+ * dest = {src.r ^ dest.r, src.g ^ dest.g, src.b ^ dest.b, src.a ^ dest.a}
+ *
+ * Note: this is NOT the Porter/Duff XOR mode; this is a bitwise xor.
+ */
+ XOR = 11,
+ /**
+ * dest = src * dest
+ */
+ MULTIPLY = 12,
+ /**
+ * dest = min(src + dest, 1.0)
+ */
+ ADD = 13,
+ /**
+ * dest = max(dest - src, 0.0)
+ */
+ SUBTRACT = 14
+ };
+
+ /**
+ * Blend a source buffer with the destination buffer.
+ *
+ * Blends a source buffer and a destination buffer, placing the result in the destination
+ * buffer. The blending is done pairwise between two corresponding RGBA values found in
+ * each buffer. The mode parameter specifies one of fifteen blending operations.
+ * See {@link BlendingMode}.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The source and destination buffers must have the same dimensions. Both buffers should be
+ * large enough for sizeX * sizeY * 4 bytes. The buffers have a row-major layout.
+ *
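+ * For example (illustrative), compositing a source over a destination of the same size:
+ *     toolkit.blend(BlendingMode::SRC_OVER, src, dst, sizeX, sizeY);
+ *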
+ * @param mode The specific blending operation to do.
+ * @param source The RGBA input buffer.
+ * @param dst The destination buffer. Used for both input and output.
+ * @param sizeX The width of both buffers, as a number of RGBA values.
+ * @param sizeY The height of both buffers, as a number of RGBA values.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ void blend(BlendingMode mode, const uint8_t* _Nonnull source, uint8_t* _Nonnull dst,
+ size_t sizeX, size_t sizeY, const Restriction* _Nullable restriction = nullptr);
+
+ /**
+ * Blur an image.
+ *
+ * Performs a Gaussian blur of the input image and stores the result in the out buffer.
+ *
+ * The radius determines which pixels are used to compute each blurred pixel. This Toolkit
+ * accepts values between 1 and 25. Larger values create a more blurred effect but also
+ * take longer to compute. When the radius extends past the edge, the edge pixel is
+ * used as a replacement for the pixels that are out of bounds.
+ *
+ * Each input pixel can be represented either by four bytes (RGBA format) or by one byte,
+ * for the less common case of blurring an alpha-channel-only image.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The input and output buffers must have the same dimensions. Both buffers should be
+ * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout.
+ *
+ * @param in The buffer of the image to be blurred.
+ * @param out The buffer that receives the blurred image.
+ * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells.
+ * @param vectorSize Either 1 or 4, the number of bytes in each cell, i.e. A vs. RGBA.
+ * @param radius The radius of the pixels used to blur.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ void blur(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
+ size_t vectorSize, int radius, const Restriction* _Nullable restriction = nullptr);
+
+ /**
+ * Identity matrix that can be passed to the {@link RenderScriptToolkit::colorMatrix} method.
+ *
+ * Using this matrix will result in no change to the pixel through multiplication although
+ * the pixel value can still be modified by the add vector, or transformed to a different
+ * format.
+ */
+ static constexpr float kIdentityMatrix[] = {
+ 1.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 1.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 1.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 1.0f
+ };
+
+ /**
+ * Matrix to turn color pixels to a grey scale.
+ *
+ * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert an
+ * image from color to greyscale.
+ */
+ static constexpr float kGreyScaleColorMatrix[] = {
+ 0.299f, 0.299f, 0.299f, 0.0f,
+ 0.587f, 0.587f, 0.587f, 0.0f,
+ 0.114f, 0.114f, 0.114f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 1.0f
+ };
+
+ /**
+ * Matrix to convert RGB to YUV.
+ *
+ * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
+ * first three bytes of each pixel from RGB to YUV. This leaves the last byte (the alpha
+ * channel) untouched.
+ *
+ * This is a simplistic conversion. Most YUV buffers have a more complicated format that is not
+ * supported by this method.
+ */
+ static constexpr float kRgbToYuvMatrix[] = {
+ 0.299f, -0.14713f, 0.615f, 0.0f,
+ 0.587f, -0.28886f, -0.51499f, 0.0f,
+ 0.114f, 0.436f, -0.10001f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 1.0f
+ };
+
+ /**
+ * Matrix to convert YUV to RGB.
+ *
+ * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
+ * first three bytes of each pixel from YUV to RGB. This leaves the last byte (the alpha
+ * channel) untouched.
+ *
+ * This is a simplistic conversion. Most YUV buffers have a more complicated format that is not
+ * supported by this method. Use {@link RenderScriptToolkit::yuvToRgb} to convert these buffers.
+ */
+ static constexpr float kYuvToRgbMatrix[] = {
+ 1.0f, 1.0f, 1.0f, 0.0f,
+ 0.0f, -0.39465f, 2.03211f, 0.0f,
+ 1.13983f, -0.5806f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 1.0f
+ };
+
+ /**
+ * Transform an image using a color matrix.
+ *
+ * Converts a 2D array of vectors of unsigned bytes, multiplying each vector by a 4x4 matrix
+ * and adding an optional vector.
+ *
+ * Each input vector is composed of 1-4 unsigned bytes. If fewer than 4 bytes, it's extended to
+ * 4, padded with zeroes. The unsigned bytes are converted from 0-255 to 0.0-1.0 floats
+ * before the multiplication is done.
+ *
+ * The resulting value is normalized from 0.0-1.0 to a 0-255 value and stored in the output.
+ * If the output vector size is less than four, the unused channels are discarded.
+ *
+ * If addVector is null, a vector of zeroes is added, i.e. a noop.
+ *
+ * Check kIdentityMatrix, kGreyScaleColorMatrix, kRgbToYuvMatrix, and kYuvToRgbMatrix for sample
+ * matrices. The YUV conversion may not work for all color spaces.
+ *
+ * @param in The buffer of the image to be converted.
+ * @param out The buffer that receives the converted image.
+ * @param inputVectorSize The number of bytes in each input cell, a value from 1 to 4.
+ * @param outputVectorSize The number of bytes in each output cell, a value from 1 to 4.
+ * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
+ * @param matrix The 4x4 matrix to multiply, in row major format.
+ * @param addVector A vector of four floats that's added to the result of the multiplication.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ void colorMatrix(const void* _Nonnull in, void* _Nonnull out, size_t inputVectorSize,
+ size_t outputVectorSize, size_t sizeX, size_t sizeY,
+ const float* _Nonnull matrix, const float* _Nullable addVector = nullptr,
+ const Restriction* _Nullable restriction = nullptr);
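+
+ // A minimal usage sketch converting an RGBA image to greyscale; the toolkit instance and
+ // buffer names are illustrative assumptions:
+ //   toolkit.colorMatrix(input.data(), output.data(), 4, 4, sizeX, sizeY,
+ //                       RenderScriptToolkit::kGreyScaleColorMatrix);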
+
+ /**
+ * Convolve an image.
+ *
+ * Applies a 3x3 or 5x5 convolution to the input array using the provided coefficients.
+ *
+ * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed.
+ * The coefficients should be provided in row-major format.
+ *
+ * When the convolution window extends past the edge, the edge values are used as replacements
+ * for the values that are off the boundary.
+ *
+ * Each input cell can be represented by one to four bytes. Each byte is multiplied
+ * and accumulated independently of the other bytes of the cell.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The input and output buffers must have the same dimensions. Both buffers should be
+ * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout.
+ *
+ * @param in The buffer of the image to be convolved.
+ * @param out The buffer that receives the convolved image.
+ * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+ * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
+ * @param coefficients 9 or 25 multipliers.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ void convolve3x3(const void* _Nonnull in, void* _Nonnull out, size_t vectorSize, size_t sizeX,
+ size_t sizeY, const float* _Nonnull coefficients,
+ const Restriction* _Nullable restriction = nullptr);
+
+ void convolve5x5(const void* _Nonnull in, void* _Nonnull out, size_t vectorSize, size_t sizeX,
+ size_t sizeY, const float* _Nonnull coefficients,
+ const Restriction* _Nullable restriction = nullptr);
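+
+ // A minimal usage sketch applying a 3x3 sharpen kernel to an RGBA image; the kernel
+ // values and buffer names are illustrative assumptions:
+ //   const float sharpen[9] = { 0.0f, -1.0f,  0.0f,
+ //                             -1.0f,  5.0f, -1.0f,
+ //                              0.0f, -1.0f,  0.0f };
+ //   toolkit.convolve3x3(input.data(), output.data(), 4, sizeX, sizeY, sharpen);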
+
+ /**
+ * Compute the histogram of an image.
+ *
+ * Tallies how many times each of the 256 possible values of a byte is found in the input.
+ *
+ * An input cell can be represented by one to four bytes. The tally is done independently
+ * for each of the bytes of the cell. Correspondingly, the out array will have
+ * 256 * vectorSize entries. The counts for value 0 are consecutive, followed by those for
+ * value 1, etc.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The source buffers should be large enough for sizeX * sizeY * vectorSize bytes. The buffers
+ * have a row-major layout. The out buffer should be large enough for 256 * vectorSize ints.
+ *
+ * @param in The buffer of the image to be analyzed.
+ * @param out The resulting vector of counts.
+ * @param sizeX The width of the input buffer, as a number of 1 to 4 byte cells.
+ * @param sizeY The height of the input buffer, as a number of 1 to 4 byte cells.
+ * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ void histogram(const uint8_t* _Nonnull in, int32_t* _Nonnull out, size_t sizeX, size_t sizeY,
+ size_t vectorSize, const Restriction* _Nullable restriction = nullptr);
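+
+ // A minimal usage sketch for an RGBA image; counts are laid out per value, then per
+ // channel, as described above (toolkit instance and buffer names are illustrative):
+ //   std::vector<int32_t> counts(256 * 4);
+ //   toolkit.histogram(input.data(), counts.data(), sizeX, sizeY, 4);
+ //   // counts[v * 4 + c] is the tally of value v in channel c.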
+
+ /**
+ * Compute the histogram of the dot product of an image.
+ *
+ * This method supports cells of 1 to 4 bytes in length. For each cell of the array,
+ * the dot product of its bytes with the provided coefficients is computed. The resulting
+ * floating point value is converted to an unsigned byte and tallied in the histogram.
+ *
+ * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used,
+ * i.e. the values [0.299f, 0.587f, 0.114f, 0.f].
+ *
+ * Each coefficient must be >= 0 and their sum must be 1.0 or less. There must be the same
+ * number of coefficients as vectorSize.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The source buffers should be large enough for sizeX * sizeY * vectorSize bytes. The buffers
+ * have a row-major layout. The out array should be large enough for 256 ints.
+ *
+ * @param in The buffer of the image to be analyzed.
+ * @param out The resulting vector of counts.
+ * @param sizeX The width of the input buffer, as a number of 1 to 4 byte cells.
+ * @param sizeY The height of the input buffer, as a number of 1 to 4 byte cells.
+ * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+ * @param coefficients The values used for the dot product. Can be nullptr.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ void histogramDot(const uint8_t* _Nonnull in, int32_t* _Nonnull out, size_t sizeX, size_t sizeY,
+ size_t vectorSize, const float* _Nullable coefficients,
+ const Restriction* _Nullable restriction = nullptr);
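+
+ // A minimal usage sketch computing a luminosity histogram of an RGBA image; passing
+ // nullptr selects the default coefficients listed above (names are illustrative):
+ //   std::vector<int32_t> luminosity(256);
+ //   toolkit.histogramDot(input.data(), luminosity.data(), sizeX, sizeY, 4, nullptr);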
+
+ /**
+ * Transform an image using a lookup table.
+ *
+ * Transforms an image by using a per-channel lookup table. Each channel of the input has an
+ * independent lookup table. The tables are 256 entries in size and can cover the full value
+ * range of a byte.
+ *
+ * The input array should be in RGBA format, where four consecutive bytes form a cell.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The input and output buffers must have the same dimensions. Both buffers should be
+ * large enough for sizeX * sizeY * 4 bytes. The buffers have a row-major layout.
+ *
+ * @param in The buffer of the image to be transformed.
+ * @param out The buffer that receives the transformed image.
+ * @param sizeX The width of both buffers, as a number of 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 4 byte cells.
+ * @param red An array of 256 values that's used to convert the R channel.
+ * @param green An array of 256 values that's used to convert the G channel.
+ * @param blue An array of 256 values that's used to convert the B channel.
+ * @param alpha An array of 256 values that's used to convert the A channel.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ void lut(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
+ const uint8_t* _Nonnull red, const uint8_t* _Nonnull green,
+ const uint8_t* _Nonnull blue, const uint8_t* _Nonnull alpha,
+ const Restriction* _Nullable restriction = nullptr);
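+
+ // A minimal usage sketch inverting the color channels of an RGBA image while leaving
+ // alpha unchanged (table and buffer names are illustrative assumptions):
+ //   uint8_t invert[256], identity[256];
+ //   for (int i = 0; i < 256; i++) { invert[i] = 255 - i; identity[i] = i; }
+ //   toolkit.lut(input.data(), output.data(), sizeX, sizeY, invert, invert, invert, identity);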
+
+ /**
+ * Transform an image using a 3D lookup table.
+ *
+ * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G,
+ * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest
+ * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry
+ * is stored in the output.
+ *
+ * The input array should be in RGBA format, where four consecutive bytes form a cell.
+ * The fourth byte of each input cell is ignored.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The input and output buffers must have the same dimensions. Both buffers should be
+ * large enough for sizeX * sizeY * 4 bytes. The buffers have a row-major layout.
+ *
+ * @param in The buffer of the image to be transformed.
+ * @param out The buffer that receives the transformed image.
+ * @param sizeX The width of both buffers, as a number of 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 4 byte cells.
+ * @param cube The translation cube, in row-major format.
+ * @param cubeSizeX The number of RGBA entries in the cube in the X direction.
+ * @param cubeSizeY The number of RGBA entries in the cube in the Y direction.
+ * @param cubeSizeZ The number of RGBA entries in the cube in the Z direction.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ void lut3d(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
+ const uint8_t* _Nonnull cube, size_t cubeSizeX, size_t cubeSizeY, size_t cubeSizeZ,
+ const Restriction* _Nullable restriction = nullptr);
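+
+ // A minimal sketch using a 2x2x2 identity cube. The R/G/B to X/Y/Z axis mapping shown
+ // here is an assumption, and the names are illustrative:
+ //   uint8_t cube[2 * 2 * 2 * 4];
+ //   for (int z = 0; z < 2; z++)
+ //     for (int y = 0; y < 2; y++)
+ //       for (int x = 0; x < 2; x++) {
+ //         uint8_t* e = &cube[((z * 2 + y) * 2 + x) * 4];
+ //         e[0] = x * 255; e[1] = y * 255; e[2] = z * 255; e[3] = 255;
+ //       }
+ //   toolkit.lut3d(input.data(), output.data(), sizeX, sizeY, cube, 2, 2, 2);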
+
+ /**
+ * Resize an image.
+ *
+ * Resizes an image using bicubic interpolation.
+ *
+ * This method supports cells of 1 to 4 bytes in length. Each byte of the cell is
+ * interpolated independently from the others.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of the output buffer. The corresponding scaled range of the input will be used. If provided,
+ * the range must be wholly contained within the dimensions described by outputSizeX and
+ * outputSizeY.
+ *
+ * The input and output buffers have a row-major layout. The input buffer should be large
+ * enough for inputSizeX * inputSizeY * vectorSize bytes, and the output buffer for
+ * outputSizeX * outputSizeY * vectorSize bytes.
+ *
+ * @param in The buffer of the image to be resized.
+ * @param out The buffer that receives the resized image.
+ * @param inputSizeX The width of the input buffer, as a number of 1-4 byte cells.
+ * @param inputSizeY The height of the input buffer, as a number of 1-4 byte cells.
+ * @param vectorSize The number of bytes in each cell of both buffers. A value from 1 to 4.
+ * @param outputSizeX The width of the output buffer, as a number of 1-4 byte cells.
+ * @param outputSizeY The height of the output buffer, as a number of 1-4 byte cells.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ void resize(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t inputSizeX,
+ size_t inputSizeY, size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
+ const Restriction* _Nullable restriction = nullptr);
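+
+ // A minimal usage sketch halving the dimensions of an RGBA image (names and sizes are
+ // illustrative assumptions):
+ //   std::vector<uint8_t> scaled((sizeX / 2) * (sizeY / 2) * 4);
+ //   toolkit.resize(input.data(), scaled.data(), sizeX, sizeY, 4, sizeX / 2, sizeY / 2);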
+
+ /**
+ * The YUV formats supported by yuvToRgb.
+ */
+ enum class YuvFormat {
+ NV21 = 0x11,
+ YV12 = 0x32315659,
+ };
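+ // These numeric values appear to match the corresponding android.graphics.ImageFormat
+ // constants (NV21 = 0x11, YV12 = 0x32315659).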
+
+ /**
+ * Convert an image from YUV to RGB.
+ *
+ * Converts an Android YUV buffer to RGB. The input buffer should be
+ * supplied in one of the supported YUV formats.
+ * The output is RGBA; the alpha channel will be set to 255.
+ *
+ * Note that for YV12 and a sizeX that's not a multiple of 32, the
+ * RenderScript Intrinsic may not have converted the image correctly.
+ * This Toolkit method should.
+ *
+ * @param in The buffer of the image to be converted.
+ * @param out The buffer that receives the converted image.
+ * @param sizeX The width in pixels of the image. Must be even.
+ * @param sizeY The height in pixels of the image.
+ * @param format Either YV12 or NV21.
+ */
+ void yuvToRgb(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
+ YuvFormat format);
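+
+ // A minimal usage sketch converting an NV21 camera frame to RGBA. Buffer names are
+ // illustrative; NV21 stores a full Y plane followed by an interleaved VU plane at
+ // quarter resolution, hence the 3/2 bytes per pixel:
+ //   std::vector<uint8_t> nv21(sizeX * sizeY * 3 / 2);
+ //   std::vector<uint8_t> rgba(sizeX * sizeY * 4);
+ //   toolkit.yuvToRgb(nv21.data(), rgba.data(), sizeX, sizeY,
+ //                    RenderScriptToolkit::YuvFormat::NV21);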
+};
+
+} // namespace renderscript
+} // namespace android
+
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H
diff --git a/toolkit/Resize.cpp b/toolkit/Resize.cpp
new file mode 100644
index 0000000..624ae8e
--- /dev/null
+++ b/toolkit/Resize.cpp
@@ -0,0 +1,769 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <math.h>
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#if defined(ARCH_X86_HAVE_AVX2)
+#include <stdint.h>
+#include <x86intrin.h>
+#include <xmmintrin.h>
+#endif
+
+#define LOG_TAG "renderscript.toolkit.Resize"
+
+namespace android {
+namespace renderscript {
+
+class ResizeTask : public Task {
+ const uchar* mIn;
+ uchar* mOut;
+ float mScaleX;
+ float mScaleY;
+ size_t mInputSizeX;
+ size_t mInputSizeY;
+
+ void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+ void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+ void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+ void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+ void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+ void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ public:
+ ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY,
+ size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
+ const Restriction* restriction)
+ : Task{outputSizeX, outputSizeY, vectorSize, false, restriction},
+ mIn{input},
+ mOut{output},
+ mInputSizeX{inputSizeX},
+ mInputSizeY{inputSizeY} {
+ mScaleX = static_cast<float>(inputSizeX) / outputSizeX;
+ mScaleY = static_cast<float>(inputSizeY) / outputSizeY;
+ }
+};
+
+void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ typedef void (ResizeTask::*KernelFunction)(uchar*, uint32_t, uint32_t, uint32_t);
+
+ KernelFunction kernel;
+ switch (mVectorSize) {
+ case 4:
+ kernel = &ResizeTask::kernelU4;
+ break;
+ case 3:
+ kernel = &ResizeTask::kernelU4;
+ break;
+ case 2:
+ kernel = &ResizeTask::kernelU2;
+ break;
+ case 1:
+ kernel = &ResizeTask::kernelU1;
+ break;
+ default:
+ ALOGE("Bad vector size %zd", mVectorSize);
+ return;
+ }
+
+ for (size_t y = startY; y < endY; y++) {
+ size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize);
+ uchar* out = mOut + offset;
+ std::invoke(kernel, this, out, startX, endX, y);
+ }
+}
+
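+// The helpers below evaluate a Catmull-Rom cubic through the four taps p0..p3 at
+// fraction x in [0, 1). Expanded, the nested expression is equivalent to
+//   0.5 * (2*p1 + (p2 - p0)*x + (2*p0 - 5*p1 + 4*p2 - p3)*x^2
+//          + (3*(p1 - p2) + p3 - p0)*x^3),
+// which reproduces p1 at x == 0 and p2 at x == 1.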
+static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) {
+ return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
+ + x * (3.f * (p1 - p2) + p3 - p0)));
+}
+
+static float2 cubicInterpolate(float2 p0, float2 p1, float2 p2, float2 p3, float x) {
+ return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
+ + x * (3.f * (p1 - p2) + p3 - p0)));
+}
+
+
+#if defined(ARCH_X86_HAVE_AVX2)
+static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
+ return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 +
+ _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2),_mm_set1_ps(p3)))
+ + x * (_mm_cvtss_f32(_mm_fmadd_ss (_mm_set1_ps(3.f),_mm_set1_ps(p1 - p2),
+ _mm_set1_ps(p3 - p0))))));
+
+}
+#else
+static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
+ //ALOGI("CP, %f, %f, %f, %f, %f", p0, p1, p2, p3, x);
+ return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
+ + x * (3.f * (p1 - p2) + p3 - p0)));
+}
+#endif
+
+static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
+ float xf, float yf, int width) {
+ int startx = (int) floor(xf - 1);
+ xf = xf - floor(xf);
+ int maxx = width - 1;
+ int xs0 = std::max(0, startx + 0);
+ int xs1 = std::max(0, startx + 1);
+ int xs2 = std::min(maxx, startx + 2);
+ int xs3 = std::min(maxx, startx + 3);
+
+ float4 p0 = cubicInterpolate(convert<float4>(yp0[xs0]),
+ convert<float4>(yp0[xs1]),
+ convert<float4>(yp0[xs2]),
+ convert<float4>(yp0[xs3]), xf);
+
+ float4 p1 = cubicInterpolate(convert<float4>(yp1[xs0]),
+ convert<float4>(yp1[xs1]),
+ convert<float4>(yp1[xs2]),
+ convert<float4>(yp1[xs3]), xf);
+
+ float4 p2 = cubicInterpolate(convert<float4>(yp2[xs0]),
+ convert<float4>(yp2[xs1]),
+ convert<float4>(yp2[xs2]),
+ convert<float4>(yp2[xs3]), xf);
+
+ float4 p3 = cubicInterpolate(convert<float4>(yp3[xs0]),
+ convert<float4>(yp3[xs1]),
+ convert<float4>(yp3[xs2]),
+ convert<float4>(yp3[xs3]), xf);
+
+ float4 p = cubicInterpolate(p0, p1, p2, p3, yf);
+ p = clamp(p + 0.5f, 0.f, 255.f);
+ return convert<uchar4>(p);
+}
+
+static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
+ float xf, float yf, int width) {
+ int startx = (int) floor(xf - 1);
+ xf = xf - floor(xf);
+ int maxx = width - 1;
+ int xs0 = std::max(0, startx + 0);
+ int xs1 = std::max(0, startx + 1);
+ int xs2 = std::min(maxx, startx + 2);
+ int xs3 = std::min(maxx, startx + 3);
+
+ float2 p0 = cubicInterpolate(convert<float2>(yp0[xs0]),
+ convert<float2>(yp0[xs1]),
+ convert<float2>(yp0[xs2]),
+ convert<float2>(yp0[xs3]), xf);
+
+ float2 p1 = cubicInterpolate(convert<float2>(yp1[xs0]),
+ convert<float2>(yp1[xs1]),
+ convert<float2>(yp1[xs2]),
+ convert<float2>(yp1[xs3]), xf);
+
+ float2 p2 = cubicInterpolate(convert<float2>(yp2[xs0]),
+ convert<float2>(yp2[xs1]),
+ convert<float2>(yp2[xs2]),
+ convert<float2>(yp2[xs3]), xf);
+
+ float2 p3 = cubicInterpolate(convert<float2>(yp3[xs0]),
+ convert<float2>(yp3[xs1]),
+ convert<float2>(yp3[xs2]),
+ convert<float2>(yp3[xs3]), xf);
+
+ float2 p = cubicInterpolate(p0, p1, p2, p3, yf);
+ p = clamp(p + 0.5f, 0.f, 255.f);
+ return convert<uchar2>(p);
+}
+
+static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
+ float xf, float yf, int width) {
+ int startx = (int) floor(xf - 1);
+ xf = xf - floor(xf);
+ int maxx = width - 1;
+ int xs0 = std::max(0, startx + 0);
+ int xs1 = std::max(0, startx + 1);
+ int xs2 = std::min(maxx, startx + 2);
+ int xs3 = std::min(maxx, startx + 3);
+
+ float p0 = cubicInterpolate((float)yp0[xs0], (float)yp0[xs1],
+ (float)yp0[xs2], (float)yp0[xs3], xf);
+ float p1 = cubicInterpolate((float)yp1[xs0], (float)yp1[xs1],
+ (float)yp1[xs2], (float)yp1[xs3], xf);
+ float p2 = cubicInterpolate((float)yp2[xs0], (float)yp2[xs1],
+ (float)yp2[xs2], (float)yp2[xs3], xf);
+ float p3 = cubicInterpolate((float)yp3[xs0], (float)yp3[xs1],
+ (float)yp3[xs2], (float)yp3[xs3], xf);
+
+ float p = cubicInterpolate(p0, p1, p2, p3, yf);
+ p = clamp(p + 0.5f, 0.f, 255.f);
+ //ALOGI("CUC,%f,%u", p, (uchar)p);
+ return (uchar)p;
+}
+
+extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);
+
+extern "C" void rsdIntrinsicResizeB4_K(
+ uchar4 *dst,
+ size_t count,
+ uint32_t xf,
+ uint32_t xinc,
+ uchar4 const *srcn,
+ uchar4 const *src0,
+ uchar4 const *src1,
+ uchar4 const *src2,
+ size_t xclip,
+ size_t avail,
+ uint64_t osc_ctl,
+ int32_t const *yr);
+
+extern "C" void rsdIntrinsicResizeB2_K(
+ uchar2 *dst,
+ size_t count,
+ uint32_t xf,
+ uint32_t xinc,
+ uchar2 const *srcn,
+ uchar2 const *src0,
+ uchar2 const *src1,
+ uchar2 const *src2,
+ size_t xclip,
+ size_t avail,
+ uint64_t osc_ctl,
+ int32_t const *yr);
+
+extern "C" void rsdIntrinsicResizeB1_K(
+ uchar *dst,
+ size_t count,
+ uint32_t xf,
+ uint32_t xinc,
+ uchar const *srcn,
+ uchar const *src0,
+ uchar const *src1,
+ uchar const *src2,
+ size_t xclip,
+ size_t avail,
+ uint64_t osc_ctl,
+ int32_t const *yr);
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
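+// Expands the vertical fraction yf into the four bicubic (Catmull-Rom) tap weights the
+// assembly kernels expect, scaled by 0x10000. yr[0] and yr[3] hold the magnitudes of the
+// two outer taps, which are negative on [0, 1]; the assembly applies them with
+// multiply-subtract instructions.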
+static void mkYCoeff(int32_t *yr, float yf) {
+ int32_t yf1 = rint(yf * 0x10000);
+ int32_t yf2 = rint(yf * yf * 0x10000);
+ int32_t yf3 = rint(yf * yf * yf * 0x10000);
+
+ yr[0] = -(2 * yf2 - yf3 - yf1) >> 1;
+ yr[1] = (3 * yf3 - 5 * yf2 + 0x20000) >> 1;
+ yr[2] = (-3 * yf3 + 4 * yf2 + yf1) >> 1;
+ yr[3] = -(yf3 - yf2) >> 1;
+}
+#endif
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
+ float xf, float yf, int width) {
+ int startx = (int) floor(xf - 1);
+ xf = xf - floor(xf);
+ int maxx = width - 1;
+ int xs0 = std::max(0, startx + 0);
+ int xs1 = std::max(0, startx + 1);
+ int xs2 = std::min(maxx, startx + 2);
+ int xs3 = std::min(maxx, startx + 3);
+
+ float4 p0 = cubicInterpolate(yp0[xs0], yp0[xs1],
+ yp0[xs2], yp0[xs3], xf);
+ float4 p1 = cubicInterpolate(yp1[xs0], yp1[xs1],
+ yp1[xs2], yp1[xs3], xf);
+ float4 p2 = cubicInterpolate(yp2[xs0], yp2[xs1],
+ yp2[xs2], yp2[xs3], xf);
+ float4 p3 = cubicInterpolate(yp3[xs0], yp3[xs1],
+ yp3[xs2], yp3[xs3], xf);
+
+ float4 p = cubicInterpolate(p0, p1, p2, p3, yf);
+ return p;
+}
+
+static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
+ float xf, float yf, int width) {
+ int startx = (int) floor(xf - 1);
+ xf = xf - floor(xf);
+ int maxx = width - 1;
+ int xs0 = std::max(0, startx + 0);
+ int xs1 = std::max(0, startx + 1);
+ int xs2 = std::min(maxx, startx + 2);
+ int xs3 = std::min(maxx, startx + 3);
+
+ float2 p0 = cubicInterpolate(yp0[xs0], yp0[xs1],
+ yp0[xs2], yp0[xs3], xf);
+ float2 p1 = cubicInterpolate(yp1[xs0], yp1[xs1],
+ yp1[xs2], yp1[xs3], xf);
+ float2 p2 = cubicInterpolate(yp2[xs0], yp2[xs1],
+ yp2[xs2], yp2[xs3], xf);
+ float2 p3 = cubicInterpolate(yp3[xs0], yp3[xs1],
+ yp3[xs2], yp3[xs3], xf);
+
+ float2 p = cubicInterpolate(p0, p1, p2, p3, yf);
+ return p;
+}
+
+static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
+ float xf, float yf, int width) {
+ int startx = (int) floor(xf - 1);
+ xf = xf - floor(xf);
+ int maxx = width - 1;
+ int xs0 = std::max(0, startx + 0);
+ int xs1 = std::max(0, startx + 1);
+ int xs2 = std::min(maxx, startx + 2);
+ int xs3 = std::min(maxx, startx + 3);
+
+ float p0 = cubicInterpolate(yp0[xs0], yp0[xs1],
+ yp0[xs2], yp0[xs3], xf);
+ float p1 = cubicInterpolate(yp1[xs0], yp1[xs1],
+ yp1[xs2], yp1[xs3], xf);
+ float p2 = cubicInterpolate(yp2[xs0], yp2[xs1],
+ yp2[xs2], yp2[xs3], xf);
+ float p3 = cubicInterpolate(yp3[xs0], yp3[xs1],
+ yp3[xs2], yp3[xs3], xf);
+
+ float p = cubicInterpolate(p0, p1, p2, p3, yf);
+ return p;
+}
+#endif
+
+void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ const uchar *pin = mIn;
+ const int srcHeight = mInputSizeY;
+ const int srcWidth = mInputSizeX;
+ const size_t stride = mInputSizeX * paddedSize(mVectorSize);
+
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
+ _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
+#else
+ float yf = (currentY + 0.5f) * mScaleY - 0.5f;
+#endif
+
+
+ int starty = (int) floor(yf - 1);
+ yf = yf - floor(yf);
+ int maxy = srcHeight - 1;
+ int ys0 = std::max(0, starty + 0);
+ int ys1 = std::max(0, starty + 1);
+ int ys2 = std::min(maxy, starty + 2);
+ int ys3 = std::min(maxy, starty + 3);
+
+ const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
+ const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
+ const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
+ const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
+
+ uchar4 *out = ((uchar4 *)outPtr);
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+ if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
+ float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+ long xf16 = rint(xf * 0x10000);
+ uint32_t xinc16 = rint(mScaleX * 0x10000);
+
+ int xoff = (xf16 >> 16) - 1;
+ int xclip = std::max(0, xoff) - xoff;
+ int len = x2 - x1;
+
+ int32_t yr[4];
+ uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
+ mkYCoeff(yr, yf);
+
+ xoff += xclip;
+
+ rsdIntrinsicResizeB4_K(
+ out, len,
+ xf16 & 0xffff, xinc16,
+ yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
+ xclip, srcWidth - xoff + xclip,
+ osc_ctl, yr);
+ out += len;
+ x1 += len;
+ }
+#endif
+
+ while(x1 < x2) {
+#if defined(ARCH_X86_HAVE_AVX2)
+ float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f), _mm_set1_ps(mScaleX),
+ _mm_set1_ps(0.5f)));
+#else
+ float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+#endif
+ *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+ out++;
+ x1++;
+ }
+}
+
+void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ const uchar *pin = mIn;
+ const int srcHeight = mInputSizeY;
+ const int srcWidth = mInputSizeX;
+ const size_t stride = mInputSizeX * mVectorSize;
+
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float yf = _mm_cvtss_f32(
+ _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
+#else
+ float yf = (currentY + 0.5f) * mScaleY - 0.5f;
+#endif
+
+ int starty = (int) floor(yf - 1);
+ yf = yf - floor(yf);
+ int maxy = srcHeight - 1;
+ int ys0 = std::max(0, starty + 0);
+ int ys1 = std::max(0, starty + 1);
+ int ys2 = std::min(maxy, starty + 2);
+ int ys3 = std::min(maxy, starty + 3);
+
+ const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
+ const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
+ const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
+ const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
+
+ uchar2 *out = ((uchar2 *)outPtr);
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+ if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
+ float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+ long xf16 = rint(xf * 0x10000);
+ uint32_t xinc16 = rint(mScaleX * 0x10000);
+
+ int xoff = (xf16 >> 16) - 1;
+ int xclip = std::max(0, xoff) - xoff;
+ int len = x2 - x1;
+
+ int32_t yr[4];
+ uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
+ mkYCoeff(yr, yf);
+
+ xoff += xclip;
+
+ rsdIntrinsicResizeB2_K(
+ out, len,
+ xf16 & 0xffff, xinc16,
+ yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
+ xclip, srcWidth - xoff + xclip,
+ osc_ctl, yr);
+ out += len;
+ x1 += len;
+ }
+#endif
+
+ while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f), _mm_set1_ps(mScaleX),
+ _mm_set1_ps(0.5f)));
+#else
+ float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+#endif
+ *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+ out++;
+ x1++;
+ }
+}
+
+void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend);
+ const uchar *pin = mIn;
+ const int srcHeight = mInputSizeY;
+ const int srcWidth = mInputSizeX;
+ const size_t stride = mInputSizeX * mVectorSize;
+
+ // ALOGI("Toolkit ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth,
+ // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr);
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float yf = _mm_cvtss_f32(
+ _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
+#else
+ float yf = (currentY + 0.5f) * mScaleY - 0.5f;
+#endif
+
+ int starty = (int) floor(yf - 1);
+ yf = yf - floor(yf);
+ int maxy = srcHeight - 1;
+ int ys0 = std::max(0, starty + 0);
+ int ys1 = std::min(maxy, std::max(0, starty + 1));
+ int ys2 = std::min(maxy, starty + 2);
+ int ys3 = std::min(maxy, starty + 3);
+
+ const uchar *yp0 = pin + stride * ys0;
+ const uchar *yp1 = pin + stride * ys1;
+ const uchar *yp2 = pin + stride * ys2;
+ const uchar *yp3 = pin + stride * ys3;
+
+ uchar *out = ((uchar *)outPtr);
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+ if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
+ float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+ long xf16 = rint(xf * 0x10000);
+ uint32_t xinc16 = rint(mScaleX * 0x10000);
+
+ int xoff = (xf16 >> 16) - 1;
+ int xclip = std::max(0, xoff) - xoff;
+ int len = x2 - x1;
+
+ int32_t yr[4];
+ uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
+ mkYCoeff(yr, yf);
+
+ // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d,
+ // xclip %d, len %d, osc_ctl %lu)",
+ // ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long)
+ // osc_ctl);
+ // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf,
+ // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3
+ // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff);
+
+ xoff += xclip;
+
+ rsdIntrinsicResizeB1_K(
+ out, len,
+ xf16 & 0xffff, xinc16,
+ yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
+ xclip, srcWidth - xoff + xclip,
+ osc_ctl, yr);
+ out += len;
+ x1 += len;
+ }
+#endif
+
+ while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f), _mm_set1_ps(mScaleX),
+ _mm_set1_ps(0.5f)));
+#else
+ float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+#endif
+
+ *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+ out++;
+ x1++;
+ }
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ const uchar *pin = mIn;
+ const int srcHeight = inputSizeY;
+ const int srcWidth = inputSizeX;
+ const size_t stride = sizeX * vectorSize;
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float yf = _mm_cvtss_f32(
+ _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
+#else
+ float yf = (currentY + 0.5f) * scaleY - 0.5f;
+#endif
+
+ int starty = (int) floor(yf - 1);
+ yf = yf - floor(yf);
+ int maxy = srcHeight - 1;
+ int ys0 = std::max(0, starty + 0);
+ int ys1 = std::max(0, starty + 1);
+ int ys2 = std::min(maxy, starty + 2);
+ int ys3 = std::min(maxy, starty + 3);
+
+ const float4 *yp0 = (const float4 *)(pin + stride * ys0);
+ const float4 *yp1 = (const float4 *)(pin + stride * ys1);
+ const float4 *yp2 = (const float4 *)(pin + stride * ys2);
+ const float4 *yp3 = (const float4 *)(pin + stride * ys3);
+
+ float4 *out = ((float4 *)outPtr);
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+ while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
+ _mm_set1_ps(0.5f)));
+#else
+ float xf = (x1 + 0.5f) * scaleX - 0.5f;
+#endif
+
+ *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+ out++;
+ x1++;
+ }
+}
+
+void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ const uchar *pin = mIn;
+ const int srcHeight = inputSizeY;
+ const int srcWidth = inputSizeX;
+ const size_t stride = sizeX * vectorSize;
+
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
+ _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
+#else
+ float yf = (currentY + 0.5f) * scaleY - 0.5f;
+#endif
+
+ int starty = (int) floor(yf - 1);
+ yf = yf - floor(yf);
+ int maxy = srcHeight - 1;
+ int ys0 = std::max(0, starty + 0);
+ int ys1 = std::max(0, starty + 1);
+ int ys2 = std::min(maxy, starty + 2);
+ int ys3 = std::min(maxy, starty + 3);
+
+ const float2 *yp0 = (const float2 *)(pin + stride * ys0);
+ const float2 *yp1 = (const float2 *)(pin + stride * ys1);
+ const float2 *yp2 = (const float2 *)(pin + stride * ys2);
+ const float2 *yp3 = (const float2 *)(pin + stride * ys3);
+
+ float2 *out = ((float2 *)outPtr);
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+ while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
+ _mm_set1_ps(0.5f)));
+#else
+ float xf = (x1 + 0.5f) * scaleX - 0.5f;
+#endif
+
+ *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+ out++;
+ x1++;
+ }
+}
+
+void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ const uchar *pin = mIn;
+ const int srcHeight = inputSizeY;
+ const int srcWidth = inputSizeX;
+ const size_t stride = sizeX * vectorSize;
+
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
+ _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
+#else
+ float yf = (currentY + 0.5f) * scaleY - 0.5f;
+#endif
+
+ int starty = (int) floor(yf - 1);
+ yf = yf - floor(yf);
+ int maxy = srcHeight - 1;
+ int ys0 = std::max(0, starty + 0);
+ int ys1 = std::max(0, starty + 1);
+ int ys2 = std::min(maxy, starty + 2);
+ int ys3 = std::min(maxy, starty + 3);
+
+ const float *yp0 = (const float *)(pin + stride * ys0);
+ const float *yp1 = (const float *)(pin + stride * ys1);
+ const float *yp2 = (const float *)(pin + stride * ys2);
+ const float *yp3 = (const float *)(pin + stride * ys3);
+
+ float *out = ((float *)outPtr);
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+ while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+ float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
+ _mm_set1_ps(0.5f)));
+#else
+ float xf = (x1 + 0.5f) * scaleX - 0.5f;
+#endif
+
+ *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+ out++;
+ x1++;
+ }
+}
+
+void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc)
+{
+
+ //check the data type to determine F or U.
+ if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
+ switch(mAlloc->getType()->getElement()->getVectorSize()) {
+ case 1:
+ mRootPtr = &kernelU1;
+ break;
+ case 2:
+ mRootPtr = &kernelU2;
+ break;
+ case 3:
+ case 4:
+ mRootPtr = &kernelU4;
+ break;
+ }
+ } else {
+ switch(mAlloc->getType()->getElement()->getVectorSize()) {
+ case 1:
+ mRootPtr = &kernelF1;
+ break;
+ case 2:
+ mRootPtr = &kernelF2;
+ break;
+ case 3:
+ case 4:
+ mRootPtr = &kernelF4;
+ break;
+ }
+ }
+}
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX,
+ size_t inputSizeY, size_t vectorSize, size_t outputSizeX,
+ size_t outputSizeY, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+ if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) {
+ return;
+ }
+ if (vectorSize < 1 || vectorSize > 4) {
+ ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+ return;
+ }
+#endif
+
+ ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize,
+ outputSizeX, outputSizeY, restriction);
+ processor->doTask(&task);
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/Resize_advsimd.S b/toolkit/Resize_advsimd.S
new file mode 100644
index 0000000..59e735c
--- /dev/null
+++ b/toolkit/Resize_advsimd.S
@@ -0,0 +1,754 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
+ * integer (bicubic has a little overshoot). It would also be possible to add
+ * a temporary DC bias to eliminate the sign bit for more precision, but that's
+ * extra arithmetic.
+ */
+.set VERTBITS, 14
+
+/* The size of the scratch buffer in which we store our vertically convolved
+ * intermediates.
+ */
+.set CHUNKSHIFT, 7 /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
+.set CHUNKSIZE, (1 << CHUNKSHIFT)
+
+/* The number of components processed in a single iteration of the innermost
+ * loop.
+ */
+.set VECSHIFT, 3
+.set VECSIZE, (1<<VECSHIFT)
+
+/* Read four different lines (except at edges where addresses may be clamped,
+ * which is why we don't simply take base and stride registers), and multiply
+ * and accumulate them by the coefficients in v3[0..3], leaving the results in
+ * v12. This gives eight 16-bit results representing a horizontal line of 2-8
+ * input pixels (depending on number of components per pixel) to be fed into
+ * the horizontal scaling pass.
+ *
+ * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
+ * known to represent negative values and VMLS is used to implement this).
+ * Output is VERTBITS signed fixed-point, which must leave room for a little
+ * overshoot beyond the nominal pixel range.
+ */
+.macro vert8, dstlo=v12.4h, dsthi=v12.8h
+ ld1 {v8.8b}, [x4], #8
+ ld1 {v9.8b}, [x5], #8
+ ld1 {v10.8b}, [x6], #8
+ ld1 {v11.8b}, [x7], #8
+ uxtl v8.8h, v8.8b
+ uxtl v9.8h, v9.8b
+ uxtl v10.8h, v10.8b
+ uxtl v11.8h, v11.8b
+ umull v12.4s, v9.4h, v3.h[1]
+ umull2 v13.4s, v9.8h, v3.h[1]
+ umlsl v12.4s, v8.4h, v3.h[0]
+ umlsl2 v13.4s, v8.8h, v3.h[0]
+ umlal v12.4s, v10.4h, v3.h[2]
+ umlal2 v13.4s, v10.8h, v3.h[2]
+ umlsl v12.4s, v11.4h, v3.h[3]
+ umlsl2 v13.4s, v11.8h, v3.h[3]
+
+ /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
+ * minus VERTBITS (the number of fraction bits we want to keep from
+ * here on).
+ */
+ sqshrn \dstlo, v12.4s, #8 + (16 - VERTBITS)
+ sqshrn2 \dsthi, v13.4s, #8 + (16 - VERTBITS)
+.endm
+
+/* As above, but only four 16-bit results into v12hi.
+ */
+.macro vert4, dst=v12.8h
+ ld1 {v8.s}[0], [x4], #4
+ ld1 {v9.s}[0], [x5], #4
+ ld1 {v10.s}[0], [x6], #4
+ ld1 {v11.s}[0], [x7], #4
+ uxtl v8.8h, v8.8b
+ uxtl v9.8h, v9.8b
+ uxtl v10.8h, v10.8b
+ uxtl v11.8h, v11.8b
+ umull v12.4s, v9.4h, v3.h[1]
+ umlsl v12.4s, v8.4h, v3.h[0]
+ umlal v12.4s, v10.4h, v3.h[2]
+ umlsl v12.4s, v11.4h, v3.h[3]
+.ifc \dst,v12.8h
+ sqshrn2 \dst, v12.4s, #8 + (16 - VERTBITS)
+.else
+ sqshrn \dst, v12.4s, #8 + (16 - VERTBITS)
+.endif
+.endm
+
+
+/* During horizontal resize having CHUNKSIZE input available means being able
+ * to produce a varying amount of output, depending on the phase of the data.
+ * This function calculates the minimum number of VECSIZE chunks extracted from
+ * a CHUNKSIZE window (x1), and the threshold value for when the count will be
+ * one higher than that (x0).
+ * These work out, conveniently, to be the quotient and remainder from:
+ * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
+ *
+ * The two values are packed together in a uint64_t for convenience; and
+ * they are, in fact, used this way as an arithmetic short-cut later on.
+ */
+/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
+ENTRY(rsdIntrinsicResize_oscctl_K)
+ lsl x2, x0, #VECSHIFT
+ mov x0, #(CHUNKSIZE << 16) - 1
+ add x0, x0, x2
+ udiv x1, x0, x2
+ msub x0, x1, x2, x0
+ add x0, x0, x1, LSL #32
+ ret
+END(rsdIntrinsicResize_oscctl_K)
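+
+/* A rough C equivalent of the calculation above, for reference (names are
+ * illustrative only):
+ *
+ *   uint64_t oscctl(uint32_t xinc) {
+ *       uint64_t step = (uint64_t) xinc << VECSHIFT;             // xinc * VECSIZE
+ *       uint64_t span = ((uint64_t) CHUNKSIZE << 16) - 1 + step; // 16.16 fixed point
+ *       uint64_t quotient = span / step;
+ *       uint64_t remainder = span - quotient * step;
+ *       return (quotient << 32) + remainder;
+ *   }
+ */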
+
+/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
+ * For the most part the vertical pass (the outer loop) is the same for all
+ * versions. Exceptions are handled in-line with conditional assembly.
+ */
+.irp comp, 1, 2, 4
+.if \comp == 1
+.set COMPONENT_SHIFT, 0
+.elseif \comp == 2
+.set COMPONENT_SHIFT, 1
+.elseif \comp == 4
+.set COMPONENT_SHIFT, 2
+.else
+.error "Unknown component count"
+.endif
+.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
+.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
+
+.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
+
+/* void rsdIntrinsicResizeB1_K(
+ * uint8_t * restrict dst, // x0
+ * size_t count, // x1
+ * uint32_t xf, // x2
+ * uint32_t xinc, // x3
+ * uint8_t const * restrict srcn, // x4
+ * uint8_t const * restrict src0, // x5
+ * uint8_t const * restrict src1, // x6
+ * uint8_t const * restrict src2, // x7
+ * size_t xclip, // [sp,#0] -> [sp,#80] -> x12
+ * size_t avail, // [sp,#8] -> [sp,#88] -> x11
+ * uint64_t osc_ctl, // [sp,#16] -> [sp,#96] -> x10
+ * int32 const *yr, // [sp,#24] -> [sp,#104] -> v4 (copied to v3 for scalar access)
+ */
+ENTRY(rsdIntrinsicResizeB\comp\()_K)
+ sub x8, sp, #48
+ sub sp, sp, #80
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x8]
+ str x19, [x8, #32]
+
+ /* align the working buffer on the stack to make it easy to use bit
+ * twiddling for address calculations.
+ */
+ sub x12, sp, #BUFFER_SIZE
+ bic x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1
+
+ ldr x8, [sp,#104] // yr
+ adrp x9, intrinsic_resize_consts
+ add x9, x9, :lo12:intrinsic_resize_consts
+ ld1 {v4.4s}, [x8]
+ ld1 {v5.8h}, [x9]
+ sqxtun v4.4h, v4.4s // yr
+ dup v6.8h, w2
+ dup v7.8h, w3
+ mla v6.8h, v5.8h, v7.8h // vxf
+ shl v7.8h, v7.8h, #VECSHIFT // vxinc
+
+ /* Compute starting condition for oscillator used to compute ahead
+ * of time how many iterations are possible before needing to
+ * refill the working buffer. This is based on the fixed-point
+ * index of the last element in the vector of pixels processed in
+ * each iteration, counting up until it would overflow.
+ */
+ sub x8, x2, x3
+ lsl x9, x3, #VECSHIFT
+ add x8, x8, x9
+
+ ldr x10, [sp,#96] // osc_ctl
+ ldp x13,x11, [sp,#80] // xclip, avail
+
+ mov x19, sp
+ mov sp, x12
+
+ /* x4-x7 contain pointers to the four lines of input to be
+ * convolved. These pointers have been clamped vertically and
+ * horizontally (which is why it's not a simple row/stride pair),
+ * and the xclip argument (now in x13) indicates how many pixels
+ * from true the x position of the pointer is. This value should
+ * be 0, 1, or 2 only.
+ *
+ * Start by placing four pixels worth of input at the far end of
+ * the buffer. As many as two of these may be clipped, so four
+ * pixels are fetched, and then the first pixel is duplicated and
+ * the data shifted according to xclip. The source pointers are
+ * then also adjusted according to xclip so that subsequent fetches
+ * match.
+ */
+ mov v3.8b, v4.8b /* make y coeffs available for vert4 and vert8 macros */
+ sub x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
+ add x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
+ add x14, x14, #4 * COMPONENT_COUNT * 2
+.if \comp == 1
+ vert4 v12.4h
+ dup v11.4h, v12.h[0]
+ st1 {v11.4h,v12.4h}, [x12]
+ ld1 {v12.4h}, [x14]
+ st1 {v12.4h}, [x15]
+.elseif \comp == 2
+ vert8
+ dup v11.4s, v12.s[0]
+ st1 {v11.8h,v12.8h}, [x12]
+ ld1 {v12.8h}, [x14]
+ st1 {v12.8h}, [x15]
+.elseif \comp == 4
+ vert8 v14.4h, v14.8h
+ vert8 v15.4h, v15.8h
+ dup v12.2d, v14.d[0]
+ dup v13.2d, v14.d[0]
+ st1 {v12.8h,v13.8h}, [x12], #32
+ st1 {v14.8h,v15.8h}, [x12]
+ sub x12, x12, #32
+ ld1 {v11.8h,v12.8h}, [x14]
+ st1 {v11.8h,v12.8h}, [x15]
+.endif
+ /* Count off four pixels into the working buffer.
+ */
+ sub x11, x11, #4
+ /* Incoming pointers were to the first _legal_ pixel. Four pixels
+ * were read unconditionally, but some may have been discarded by
+ * xclip, so we rewind the pointers to compensate.
+ */
+ sub x4, x4, x13, LSL #(COMPONENT_SHIFT)
+ sub x5, x5, x13, LSL #(COMPONENT_SHIFT)
+ sub x6, x6, x13, LSL #(COMPONENT_SHIFT)
+ sub x7, x7, x13, LSL #(COMPONENT_SHIFT)
+
+ /* First tap starts where we just pre-filled, at the end of the
+ * buffer.
+ */
+ add x2, x2, #(CHUNKSIZE * 2 - 4) << 16
+
+ /* Use overflowing arithmetic to implement wraparound array
+ * indexing.
+ */
+ lsl x2, x2, #(47 - CHUNKSHIFT)
+ lsl x3, x3, #(47 - CHUNKSHIFT)
+
+
+ /* Start of outermost loop.
+ * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
+ * number of iterations of the inner loop that can be performed and
+ * get into that.
+ *
+ * The fill is complicated by the possibility of running out of
+ * input before the scratch buffer is filled. If this isn't a risk
+ * then it's handled by the simple loop at 2:, otherwise the
+ * horrible loop at 3:.
+ */
+1: mov v3.8b, v4.8b /* put y scaling coefficients somewhere handy */
+ subs x11, x11, #CHUNKSIZE
+ bge 2f /* if at least CHUNKSIZE are available... */
+ add x11, x11, #CHUNKSIZE /* if they're not... */
+ b 4f
+ /* basic fill loop, processing 8 bytes at a time until there are
+ * fewer than eight bytes available.
+ */
+3: vert8
+ sub x11, x11, #8 / COMPONENT_COUNT
+ st1 {v12.8h}, [x12], #16
+4: cmp x11, #8 / COMPONENT_COUNT - 1
+ bgt 3b
+.if \comp == 4
+ blt 3f
+ /* The last pixel (four bytes) if necessary */
+ vert4
+.else
+ cmp x11, #1
+ blt 3f
+ /* The last pixels if necessary */
+ sub x4, x4, #8
+ sub x5, x5, #8
+ sub x6, x6, #8
+ sub x7, x7, #8
+ add x4, x4, x11, LSL #(COMPONENT_SHIFT)
+ add x5, x5, x11, LSL #(COMPONENT_SHIFT)
+ add x6, x6, x11, LSL #(COMPONENT_SHIFT)
+ add x7, x7, x11, LSL #(COMPONENT_SHIFT)
+ vert8
+ sub x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
+ sub sp, sp, #32
+ sub x11, x11, #16
+.if \comp == 1
+ dup v13.8h, v12.h[7]
+.elseif \comp == 2
+ dup v13.4s, v12.s[3]
+.endif
+ st1 {v12.8h,v13.8h}, [sp]
+ ld1 {v12.8h}, [x11]
+ add sp, sp, #32
+ b 4f
+.endif
+ /* Keep filling until we get to the end of this chunk of the buffer */
+3:
+.if \comp == 1
+ dup v12.8h, v12.h[7]
+.elseif \comp == 2
+ dup v12.4s, v12.s[3]
+.elseif \comp == 4
+ dup v12.2d, v12.d[1]
+.endif
+4: st1 {v12.8h}, [x12], #16
+ tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+ bne 3b
+ b 4f
+
+.align 4
+2: /* Quickly pull a chunk of data into the working buffer.
+ */
+ vert8
+ st1 {v12.8h}, [x12], #16
+ vert8
+ st1 {v12.8h}, [x12], #16
+ tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+ bne 2b
+ cmp x11, #0
+ bne 3f
+4: /* if we end with 0 pixels left we'll have nothing handy to spread
+ * across to the right, so we rewind a bit.
+ */
+ mov x11, #1
+ sub x4, x4, #COMPONENT_COUNT
+ sub x5, x5, #COMPONENT_COUNT
+ sub x6, x6, #COMPONENT_COUNT
+ sub x7, x7, #COMPONENT_COUNT
+3: /* copy four taps (width of cubic window) to far end for overflow
+ * address handling
+ */
+ sub x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
+ eor x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+ ld1 {v14.4h}, [x13]
+.elseif \comp == 2
+ ld1 {v14.8h}, [x13]
+.elseif \comp == 4
+ ld1 {v14.8h,v15.8h}, [x13]
+.endif
+ add x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+ st1 {v14.4h}, [x13]
+.elseif \comp == 2
+ st1 {v14.8h}, [x13]
+.elseif \comp == 4
+ st1 {v14.8h,v15.8h}, [x13]
+.endif
+ /* The high 32-bits of x10 contains the maximum possible iteration
+ * count, but if x8 is greater than the low 32-bits of x10 then
+ * this indicates that the count must be reduced by one for this
+ * iteration to avoid reading past the end of the available data.
+ */
+ sub x13, x10, x8
+ lsr x13, x13, #32
+
+ madd x8, x13, x9, x8
+ sub x8, x8, #(CHUNKSIZE << 16)
+
+ /* prefer to count pixels, rather than vectors, to clarify the tail
+ * store case on exit.
+ */
+ lsl x13, x13, #VECSHIFT
+ cmp x13, x1
+ csel x13, x1, x13, gt
+
+ sub x1, x1, x13
+
+ lsl x13, x13, #COMPONENT_SHIFT
+
+ mov w14, #0x8000
+ movi v30.8h, #3
+ dup v31.8h, w14
+
+ cmp x13, #0
+ bgt 3f
+ cmp x1, #0
+ bgt 1b /* an extreme case where we shouldn't use code in this structure */
+ b 9f
+
+ .align 4
+2: /* Inner loop continues here, but starts at 3:, see end of loop
+ * below for explanation. */
+.if LOOP_OUTPUT_SIZE == 4
+ st1 {v8.s}[0], [x0], #4
+.elseif LOOP_OUTPUT_SIZE == 8
+ st1 {v8.8b}, [x0], #8
+.elseif LOOP_OUTPUT_SIZE == 16
+ st1 {v8.16b}, [x0], #16
+.elseif LOOP_OUTPUT_SIZE == 32
+ st1 {v8.16b,v9.16b}, [x0], #32
+.endif
+ /* Inner loop: here the four x coefficients for each tap are
+ * calculated in vector code, and the addresses are calculated in
+ * scalar code, and these calculations are interleaved.
+ */
+3: ushr v8.8h, v6.8h, #1 // sxf
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ sqrdmulh v9.8h, v8.8h, v8.8h // sxf**2
+ add x2, x2, x3
+ sqrdmulh v10.8h, v9.8h, v8.8h // sxf**3
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ sshll v11.4s, v9.4h, #2
+ sshll2 v12.4s, v9.8h, #2
+ add x2, x2, x3
+ smlsl v11.4s, v10.4h, v30.4h
+ smlsl2 v12.4s, v10.8h, v30.8h
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+
+ shadd v0.8h, v10.8h, v8.8h
+ add x2, x2, x3
+ sub v0.8h, v9.8h, v0.8h
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+
+ saddw v1.4s, v11.4s, v9.4h
+ saddw2 v13.4s, v12.4s, v9.8h
+ add x2, x2, x3
+ shrn v1.4h, v1.4s, #1
+ shrn2 v1.8h, v13.4s, #1
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ sub v1.8h, v1.8h, v31.8h
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+
+ saddw v2.4s, v11.4s, v8.4h
+ saddw2 v13.4s, v12.4s, v8.8h
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ shrn v2.4h, v2.4s, #1
+ shrn2 v2.8h, v13.4s, #1
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ neg v2.8h, v2.8h
+
+ shsub v3.8h, v10.8h, v9.8h
+
+ /* increment the x fractional parts (overflow is ignored, as the
+ * scalar arithmetic shadows this addition with full precision).
+ */
+ add v6.8h, v6.8h, v7.8h
+
+ /* At this point we have four pointers in x8-x11, pointing to the
+ * four taps in the scratch buffer that must be convolved together
+ * to produce an output pixel (one output pixel per pointer).
+ * These pointers usually overlap, but their spacing is irregular
+ * so resolving the redundancy through L1 is a pragmatic solution.
+ *
+ * The scratch buffer is made of signed 16-bit data, holding over
+ * some extra precision, and overshoot, from the vertical pass.
+ *
+ * We also have the 16-bit unsigned fixed-point weights for each
+ * of the four taps in v0 - v3. That's eight pixels worth of
+ * coefficients when we have only four pointers, so calculations
+ * for four more pixels are interleaved with the fetch and permute
+ * code for each variant in the following code.
+ *
+ * The data arrangement is less than ideal for any pixel format,
+ * but permuting loads help to mitigate most of the problems.
+ *
+ * Note also that the two outside taps of a bicubic are negative,
+ * but these coefficients are unsigned. The sign is hard-coded by
+ * use of multiply-and-subtract operations.
+ */
+.if \comp == 1
+ /* The uchar 1 case.
+ * Issue one lanewise ld4.h to load four consecutive pixels from
+ * one pointer (one pixel) into four different registers; then load
+ * four consecutive s16 values from the next pointer (pixel) into
+ * the next lane of those four registers, etc., so that we finish
+ * with v12 - v15 representing the four taps, and each lane
+ * representing a separate pixel.
+ *
+ * The first ld4 uses a splat to avoid any false dependency on
+ * the previous state of the register.
+ */
+ ld4r {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[1], [x15]
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[2], [x16]
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[3], [x17]
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[4], [x14]
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ ld4 {v12.h,v13.h,v14.h,v15.h}[5], [x15]
+ ld4 {v12.h,v13.h,v14.h,v15.h}[6], [x16]
+ ld4 {v12.h,v13.h,v14.h,v15.h}[7], [x17]
+
+ smull v8.4s, v12.4h, v0.4h
+ smull2 v9.4s, v12.8h, v0.8h
+ smlsl v8.4s, v13.4h, v1.4h
+ smlsl2 v9.4s, v13.8h, v1.8h
+ smlsl v8.4s, v14.4h, v2.4h
+ smlsl2 v9.4s, v14.8h, v2.8h
+ smlal v8.4s, v15.4h, v3.4h
+ smlal2 v9.4s, v15.8h, v3.8h
+
+ subs x13, x13, #LOOP_OUTPUT_SIZE
+
+ sqrshrn v8.4h, v8.4s, #15
+ sqrshrn2 v8.8h, v9.4s, #15
+
+ sqrshrun v8.8b, v8.8h, #VERTBITS - 8
+.elseif \comp == 2
+ /* The uchar2 case:
+ * This time load pairs of values into adjacent lanes in v12 - v15
+ * by aliasing them as u32 data; leaving room for only four pixels,
+ * so the process has to be done twice. This also means that the
+ * coefficient registers fail to align with the coefficient data
+ * (eight separate pixels), so that has to be doubled-up to match.
+ */
+ ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+
+ /* double-up coefficients to align with component pairs */
+ zip1 v16.8h, v0.8h, v0.8h
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ zip1 v17.8h, v1.8h, v1.8h
+ zip1 v18.8h, v2.8h, v2.8h
+ zip1 v19.8h, v3.8h, v3.8h
+
+ smull v8.4s, v12.4h, v16.4h
+ smull2 v9.4s, v12.8h, v16.8h
+ smlsl v8.4s, v13.4h, v17.4h
+ smlsl2 v9.4s, v13.8h, v17.8h
+ smlsl v8.4s, v14.4h, v18.4h
+ smlsl2 v9.4s, v14.8h, v18.8h
+ smlal v8.4s, v15.4h, v19.4h
+ smlal2 v9.4s, v15.8h, v19.8h
+
+ sqrshrn v8.4h, v8.4s, #15
+ sqrshrn2 v8.8h, v9.4s, #15
+
+ ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
+ ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
+
+ /* double-up coefficients to align with component pairs */
+ zip2 v16.8h, v0.8h, v0.8h
+ zip2 v17.8h, v1.8h, v1.8h
+ zip2 v18.8h, v2.8h, v2.8h
+ zip2 v19.8h, v3.8h, v3.8h
+
+ smull v10.4s, v12.4h, v16.4h
+ smull2 v11.4s, v12.8h, v16.8h
+ smlsl v10.4s, v13.4h, v17.4h
+ smlsl2 v11.4s, v13.8h, v17.8h
+ smlsl v10.4s, v14.4h, v18.4h
+ smlsl2 v11.4s, v14.8h, v18.8h
+ smlal v10.4s, v15.4h, v19.4h
+ smlal2 v11.4s, v15.8h, v19.8h
+
+ subs x13, x13, #LOOP_OUTPUT_SIZE
+
+ sqrshrn v9.4h, v10.4s, #15
+ sqrshrn2 v9.8h, v11.4s, #15
+
+ sqrshrun v8.8b, v8.8h, #VERTBITS - 8
+ sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
+.elseif \comp == 4
+ /* The uchar4 case.
+ * This case is comparatively painless because four s16s are the
+ * smallest addressable unit for a vmul-by-scalar. Rather than
+ * permute the data, simply arrange the multiplies to suit the way
+ * the data comes in. That's a lot of data, though, so things
+ * progress in pairs of pixels at a time.
+ */
+ ld1 {v12.8h,v13.8h}, [x14]
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld1 {v14.8h,v15.8h}, [x15]
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+
+ smull v8.4s, v12.4h, v0.h[0]
+ smull v9.4s, v14.4h, v0.h[1]
+ smlsl2 v8.4s, v12.8h, v1.h[0]
+ smlsl2 v9.4s, v14.8h, v1.h[1]
+ smlsl v8.4s, v13.4h, v2.h[0]
+ smlsl v9.4s, v15.4h, v2.h[1]
+ smlal2 v8.4s, v13.8h, v3.h[0]
+ smlal2 v9.4s, v15.8h, v3.h[1]
+
+ /* And two more... */
+ ld1 {v12.8h,v13.8h}, [x16]
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld1 {v14.8h,v15.8h}, [x17]
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+
+ sqrshrn v8.4h, v8.4s, #15
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ sqrshrn2 v8.8h, v9.4s, #15
+
+ smull v10.4s, v12.4h, v0.h[2]
+ smull v11.4s, v14.4h, v0.h[3]
+ smlsl2 v10.4s, v12.8h, v1.h[2]
+ smlsl2 v11.4s, v14.8h, v1.h[3]
+ smlsl v10.4s, v13.4h, v2.h[2]
+ smlsl v11.4s, v15.4h, v2.h[3]
+ smlal2 v10.4s, v13.8h, v3.h[2]
+ smlal2 v11.4s, v15.8h, v3.h[3]
+
+ sqrshrn v9.4h, v10.4s, #15
+ sqrshrn2 v9.8h, v11.4s, #15
+
+ sqrshrun v8.8b, v8.8h, #VERTBITS - 8
+ sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
+
+ /* And two more... */
+ ld1 {v12.8h,v13.8h}, [x14]
+ ld1 {v14.8h,v15.8h}, [x15]
+
+ smull v10.4s, v12.4h, v0.h[4]
+ smull v11.4s, v14.4h, v0.h[5]
+ smlsl2 v10.4s, v12.8h, v1.h[4]
+ smlsl2 v11.4s, v14.8h, v1.h[5]
+ smlsl v10.4s, v13.4h, v2.h[4]
+ smlsl v11.4s, v15.4h, v2.h[5]
+ smlal2 v10.4s, v13.8h, v3.h[4]
+ smlal2 v11.4s, v15.8h, v3.h[5]
+
+ /* And two more... */
+ ld1 {v12.8h,v13.8h}, [x16]
+ ld1 {v14.8h,v15.8h}, [x17]
+
+ subs x13, x13, #LOOP_OUTPUT_SIZE
+
+ sqrshrn v9.4h, v10.4s, #15
+ sqrshrn2 v9.8h, v11.4s, #15
+
+ smull v10.4s, v12.4h, v0.h[6]
+ smull v11.4s, v14.4h, v0.h[7]
+ smlsl2 v10.4s, v12.8h, v1.h[6]
+ smlsl2 v11.4s, v14.8h, v1.h[7]
+ smlsl v10.4s, v13.4h, v2.h[6]
+ smlsl v11.4s, v15.4h, v2.h[7]
+ smlal2 v10.4s, v13.8h, v3.h[6]
+ smlal2 v11.4s, v15.8h, v3.h[7]
+
+ sqrshrn v10.4h, v10.4s, #15
+ sqrshrn2 v10.8h, v11.4s, #15
+
+ sqrshrun v9.8b, v9.8h, #VERTBITS - 8
+ sqrshrun2 v9.16b, v10.8h, #VERTBITS - 8
+.endif
+ bgt 2b /* continue inner loop */
+ /* The inner loop has already been limited to ensure that none of
+ * the earlier iterations could overfill the output, so the store
+ * appears within the loop but after the conditional branch (at the
+ * top). At the end, provided it won't overfill, perform the final
+ * store here. If it would, then break out to the tricky tail case
+ * instead.
+ */
+ blt 1f
+ /* Store the amount of data appropriate to the configuration of the
+ * instance being assembled.
+ */
+.if LOOP_OUTPUT_SIZE == 4
+ st1 {v8.s}[0], [x0], #4
+.elseif LOOP_OUTPUT_SIZE == 8
+ st1 {v8.8b}, [x0], #8
+.elseif LOOP_OUTPUT_SIZE == 16
+ st1 {v8.16b}, [x0], #16
+.elseif LOOP_OUTPUT_SIZE == 32
+ st1 {v8.16b,v9.16b}, [x0], #32
+.endif
+ b 1b /* resume outer loop */
+ /* Partial tail store case:
+ * Different versions of the code need different subsets of the
+ * following partial stores. Here the number of components and the
+ * size of the chunk of data produced by each inner loop iteration
+ * is tested to figure out whether or not each phrase is relevant.
+ */
+.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
+1: tst x13, #16
+ beq 1f
+ st1 {v8.16b}, [x0], #16
+ mov v8.16b, v9.16b
+.endif
+.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
+1: tst x13, #8
+ beq 1f
+ st1 {v8.8b}, [x0], #8
+ ext v8.16b, v8.16b, v8.16b, #8
+.endif
+.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
+1: tst x13, #4
+ beq 1f
+ st1 {v8.s}[0], [x0], #4
+ ext v8.8b, v8.8b, v8.8b, #4
+.endif
+.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
+1: tst x13, #2
+ beq 1f
+ st1 {v8.h}[0], [x0], #2
+ ext v8.8b, v8.8b, v8.8b, #2
+.endif
+.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
+1: tst x13, #1
+ beq 1f
+ st1 {v8.b}[0], [x0], #1
+.endif
+1:
+9: mov sp, x19
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ldr x19, [sp], #16
+ ret
+END(rsdIntrinsicResizeB\comp\()_K)
+.endr
+
+.rodata
+intrinsic_resize_consts: .hword 0, 1, 2, 3, 4, 5, 6, 7
diff --git a/toolkit/Resize_neon.S b/toolkit/Resize_neon.S
new file mode 100644
index 0000000..eb7f694
--- /dev/null
+++ b/toolkit/Resize_neon.S
@@ -0,0 +1,799 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
+ * integer (bicubic has a little overshoot). It would also be possible to add
+ * a temporary DC bias to eliminate the sign bit for more precision, but that's
+ * extra arithmetic.
+ */
+.set VERTBITS, 14
+
+/* The size of the scratch buffer in which we store our vertically convolved
+ * intermediates.
+ */
+.set CHUNKSHIFT, 7
+.set CHUNKSIZE, (1 << CHUNKSHIFT)
+
+/* The number of components processed in a single iteration of the innermost
+ * loop.
+ */
+.set VECSHIFT, 3
+.set VECSIZE, (1<<VECSHIFT)
+
+/* Read four different lines (except at edges where addresses may be clamped,
+ * which is why we don't simply take base and stride registers), and multiply
+ * and accumulate them by the coefficients in d6[0..3], leaving the results in
+ * q12. This gives eight 16-bit results representing a horizontal line of 2-8
+ * input pixels (depending on number of components per pixel) to be fed into
+ * the horizontal scaling pass.
+ *
+ * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
+ * known to represent negative values and VMLS is used to implement this).
+ * Output is VERTBITS signed fixed-point, which must leave room for a little
+ * bit of overshoot beyond [0,1.0).
+ */
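+
+/* In scalar terms (an illustrative sketch only, not part of the build), each output element is:
+ *   out[i] = sat_s16((c1*row1[i] + c2*row2[i] - c0*row0[i] - c3*row3[i]) >> (8 + 16 - VERTBITS))
+ * where c0..c3 are the unsigned coefficients in d6[0..3] and row0..row3 are the four input
+ * lines addressed by r4-r7.
+ */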
+.macro vert8, dstlo=d24, dsthi=d25
+ vld1.u8 d16, [r4]!
+ vld1.u8 d18, [r5]!
+ vld1.u8 d20, [r6]!
+ vld1.u8 d22, [r7]!
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmull.u16 q12, d18, d6[1]
+ vmull.u16 q13, d19, d6[1]
+ vmlsl.u16 q12, d16, d6[0]
+ vmlsl.u16 q13, d17, d6[0]
+ vmlal.u16 q12, d20, d6[2]
+ vmlal.u16 q13, d21, d6[2]
+ vmlsl.u16 q12, d22, d6[3]
+ vmlsl.u16 q13, d23, d6[3]
+
+ /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
+ * minus VERTBITS (the number of fraction bits we want to keep from
+ * here on).
+ */
+ vqshrn.s32 \dstlo, q12, #8 + 16 - VERTBITS
+ vqshrn.s32 \dsthi, q13, #8 + 16 - VERTBITS
+.endm
+
+/* As above, but only four 16-bit results into d25.
+ */
+.macro vert4
+ vld1.u32 d16[0], [r4]!
+ vld1.u32 d18[0], [r5]!
+ vld1.u32 d20[0], [r6]!
+ vld1.u32 d22[0], [r7]!
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmull.u16 q12, d18, d6[1]
+ vmlsl.u16 q12, d16, d6[0]
+ vmlal.u16 q12, d20, d6[2]
+ vmlsl.u16 q12, d22, d6[3]
+ vqshrn.s32 d25, q12, #8 + 16 - VERTBITS
+.endm
+
+
+/* During horizontal resize having CHUNKSIZE input available means being able
+ * to produce a varying amount of output, depending on the phase of the data.
+ * This function calculates the minimum number of VECSIZE chunks extracted from
+ * a CHUNKSIZE window (r1), and the threshold value for when the count will be
+ * one higher than that (r0).
+ * These work out, conveniently, to be the quotient and remainder from:
+ * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
+ *
+ * The two values can be packed together in a uint64_t for convenience; and
+ * they are, in fact, used this way as an arithmetic short-cut later on.
+ */
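+
+/* A hypothetical C equivalent of the routine below (illustrative only, not part of the build;
+ * xinc is a 16.16 fixed-point step):
+ *
+ *   uint64_t oscctl(uint32_t xinc) {
+ *       uint32_t step = xinc << VECSHIFT;               // xinc * VECSIZE
+ *       uint32_t num  = (CHUNKSIZE << 16) - 1 + step;   // CHUNKSIZE in the same 16.16 domain
+ *       return ((uint64_t)(num / step) << 32) | (num % step);  // quotient high, threshold low
+ *   }
+ */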
+
+/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */
+ENTRY(rsdIntrinsicResize_oscctl_K)
+ lsl r2, r0, #VECSHIFT
+ movw r0, #:lower16:(CHUNKSIZE << 16) - 1
+ movt r0, #:upper16:(CHUNKSIZE << 16) - 1
+ add r0, r0, r2
+#if defined(ARCH_ARM_USE_UDIV)
+ udiv r1, r0, r2
+ mls r0, r1, r2, r0
+#else
+ clz r3, r2
+ clz r1, r0
+ subs r3, r3, r1
+ movlt r3, #0
+ mov r1, #1
+ lsl r2, r2, r3
+ lsl r3, r1, r3
+ mov r1, #0
+1: cmp r2, r0
+ addls r1, r3
+ subls r0, r2
+ lsrs r3, r3, #1
+ lsr r2, r2, #1
+ bne 1b
+#endif
+ bx lr
+END(rsdIntrinsicResize_oscctl_K)
+
+/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
+ * For the most part the vertical pass (the outer loop) is the same for all
+ * versions. Exceptions are handled in-line with conditional assembly.
+ */
+.irp comp, 1, 2, 4
+.if \comp == 1
+.set COMPONENT_SHIFT, 0
+.elseif \comp == 2
+.set COMPONENT_SHIFT, 1
+.elseif \comp == 4
+.set COMPONENT_SHIFT, 2
+.else
+.error "Unknown component count"
+.endif
+.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
+.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
+
+.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
+.set OSC_STORE, (BUFFER_SIZE + 0)
+.set OSCSTEP_STORE, (BUFFER_SIZE + 4)
+.set OSCCTL_STORE, (BUFFER_SIZE + 8)
+.set AVAIL_STORE, (BUFFER_SIZE + 16)
+.set SP_STORE, (BUFFER_SIZE + 24) /* should be +20, but rounded up to make a legal constant somewhere */
+
+/* void rsdIntrinsicResizeB\comp\()_K(
+ * uint8_t * restrict dst, // r0
+ * size_t count, // r1
+ * uint32_t xf, // r2
+ * uint32_t xinc, // r3
+ * uint8_t const * restrict srcn, // [sp] -> [sp,#104] -> r4
+ * uint8_t const * restrict src0, // [sp,#4] -> [sp,#108] -> r5
+ * uint8_t const * restrict src1, // [sp,#8] -> [sp,#112] -> r6
+ * uint8_t const * restrict src2, // [sp,#12] -> [sp,#116] -> r7
+ * size_t xclip, // [sp,#16] -> [sp,#120]
+ * size_t avail, // [sp,#20] -> [sp,#124] -> lr
+ * uint64_t osc_ctl, // [sp,#24] -> [sp,#128]
+ * int32_t const *yr); // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access)
+ */
+ENTRY(rsdIntrinsicResizeB\comp\()_K)
+ push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ vpush {d8-d15}
+
+ /* align the working buffer on the stack to make it easy to use bit
+ * twiddling for address calculations and bounds tests.
+ */
+ sub r12, sp, #BUFFER_SIZE + 32
+ mov lr, sp
+ bfc r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1
+ mov sp, r12
+ str lr, [sp,#SP_STORE]
+
+ ldr r8, [lr,#136] // yr
+ adr r9, 8f
+ vld1.s32 {q4}, [r8]
+ vld1.s16 {q5}, [r9]
+ vqmovun.s32 d8, q4 // yr
+ vdup.s16 q6, r2
+ vdup.s16 q7, r3
+ vmla.s16 q6, q5, q7 // vxf
+ vshl.s16 q7, q7, #VECSHIFT // vxinc
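+ /* q6 now holds the low 16 bits of xf + {0..7} * xinc (the per-lane x fractions);
+ * q7 holds xinc * VECSIZE. Only the fractional parts matter on the vector side. */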
+
+ ldrd r4,r5, [lr,#104] // srcn, src0
+ ldrd r6,r7, [lr,#112] // src1, src2
+
+ /* Compute starting condition for oscillator used to compute ahead
+ * of time how many iterations are possible before needing to
+ * refill the working buffer. This is based on the fixed-point
+ * index of the last element in the vector of pixels processed in
+ * each iteration, counting up until it would overflow.
+ */
+ sub r8, r2, r3
+ mov r9, r3, LSL #VECSHIFT
+ add r8, r8, r9
+
+ ldrd r10,r11, [lr,#128] // osc_ctl
+
+ str r8, [sp,#OSC_STORE]
+ str r9, [sp,#OSCSTEP_STORE]
+ str r10, [sp,#OSCCTL_STORE]
+ str r11, [sp,#OSCCTL_STORE+4]
+ ldrd r10,r11, [lr,#120] // xclip,avail
+
+
+ /* r4-r7 contain pointers to the four lines of input to be
+ * convolved. These pointers have been clamped vertically and
+ * horizontally (which is why it's not a simple row/stride pair),
+ * and the xclip argument (now in r10) indicates how many pixels
+ * from true the x position of the pointer is. This value should
+ * be 0, 1, or 2 only.
+ *
+ * Start by placing four pixels worth of input at the far end of
+ * the buffer. As many as two of these may be clipped, so four
+ * pixels are fetched, and then the first pixel is duplicated and
+ * the data shifted according to xclip. The source pointers are
+ * then also adjusted according to xclip so that subsequent fetches
+ * match.
+ */
+ vmov d6, d8 /* make y coeffs available for vert4 and vert8 macros */
+
+ sub r8, r12, r10, LSL #COMPONENT_SHIFT + 1
+ add r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
+ add r8, r8, #4 * COMPONENT_COUNT * 2
+.if \comp == 1
+ vert4
+ vdup.s16 d24, d25[0]
+ vst1.s16 {q12}, [r12]
+ vld1.s16 {d24}, [r8]
+ vst1.s16 {d24}, [r9]
+.elseif \comp == 2
+ vert8
+ vdup.u32 q11, d24[0]
+ vst1.s16 {q11,q12}, [r12]
+ vld1.s16 {q12}, [r8]
+ vst1.s16 {q12}, [r9]
+.elseif \comp == 4
+ vert8 d28, d29
+ vert8 d30, d31
+ vmov.u64 d24, d28
+ vmov.u64 d25, d28
+ vmov.u64 d26, d28
+ vmov.u64 d27, d28
+ vst1.s16 {q12,q13}, [r12]!
+ vst1.s16 {q14,q15}, [r12]
+ sub r12, r12, #32
+ vld1.s16 {q11,q12}, [r8]
+ vst1.s16 {q11,q12}, [r9]
+.endif
+ /* Count off four pixels into the working buffer, and move count to
+ * its new home.
+ */
+ sub lr, r11, #4
+ /* Incoming pointers were to the first _legal_ pixel. Four pixels
+ * were read unconditionally, but some may have been discarded by
+ * xclip, so we rewind the pointers to compensate.
+ */
+ sub r4, r4, r10, LSL #COMPONENT_SHIFT
+ sub r5, r5, r10, LSL #COMPONENT_SHIFT
+ sub r6, r6, r10, LSL #COMPONENT_SHIFT
+ sub r7, r7, r10, LSL #COMPONENT_SHIFT
+
+ /* First tap starts where we just pre-filled, at the end of the
+ * buffer.
+ */
+ add r2, r2, #(CHUNKSIZE * 2 - 4) << 16
+
+ /* Use overflowing arithmetic to implement wraparound array
+ * indexing.
+ */
+ mov r2, r2, LSL #(15 - CHUNKSHIFT)
+ mov r3, r3, LSL #(15 - CHUNKSHIFT)
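+ /* From here on, LSR #(31 - CHUNKSHIFT) recovers the scratch-buffer index; any bits
+ * above that wrap away, giving modulo-(2 * CHUNKSIZE) addressing for free. */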
+
+ str lr, [sp,#AVAIL_STORE]
+
+ /* Start of outermost loop.
+ * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
+ * number of iterations of the inner loop that can be performed and
+ * get into that.
+ *
+ * The fill is complicated by the possibility of running out of
+ * input before the scratch buffer is filled. If this isn't a risk
+ * then it's handled by the simple loop at 2:, otherwise the
+ * horrible loop at 3:.
+ */
+1: ldr lr, [sp,#AVAIL_STORE] /* get number of pixels available */
+ vmov d6, d8 /* put y scaling coefficients somewhere handy */
+ subs lr, #CHUNKSIZE
+ bge 2f /* if at least CHUNKSIZE are available... */
+ add lr, #CHUNKSIZE /* if they're not... */
+ b 4f
+ /* ..just sneaking a literal in here after this unconditional branch.. */
+8: .hword 0, 1, 2, 3, 4, 5, 6, 7
+ /* basic fill loop, processing 8 bytes at a time until there are
+ * fewer than eight bytes available.
+ */
+3: vert8
+ sub lr, lr, #8 / COMPONENT_COUNT
+ vst1.s16 {q12}, [r12]!
+4: cmp lr, #8 / COMPONENT_COUNT - 1
+ bgt 3b
+.if \comp == 4
+ blt 3f
+ /* The last pixel (four bytes) if necessary */
+ vert4
+.else
+ cmp lr, #1
+ blt 3f
+ /* The last pixels if necessary */
+ sub r4, r4, #8
+ sub r5, r5, #8
+ sub r6, r6, #8
+ sub r7, r7, #8
+ add r4, r4, lr, LSL #COMPONENT_SHIFT
+ add r5, r5, lr, LSL #COMPONENT_SHIFT
+ add r6, r6, lr, LSL #COMPONENT_SHIFT
+ add r7, r7, lr, LSL #COMPONENT_SHIFT
+ vert8
+ sub lr, sp, lr, LSL #COMPONENT_SHIFT + 1
+ sub sp, sp, #32
+ sub lr, lr, #16
+.if \comp == 1
+ vdup.s16 q13, d25[3]
+.elseif \comp == 2
+ vdup.u32 q13, d25[1]
+.endif
+ vst1.s16 {q12,q13}, [sp]
+ vld1.s16 {q12}, [lr]
+ add sp, sp, #32
+ b 4f
+.endif
+ /* Keep filling until we get to the end of this chunk of the buffer */
+3:
+.if \comp == 1
+ vdup.s16 q12, d25[3]
+.elseif \comp == 2
+ vdup.u32 q12, d25[1]
+.elseif \comp == 4
+ vmov.u64 d24, d25
+.endif
+4: vst1.s16 {q12}, [r12]!
+ tst r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+ bne 3b
+ b 4f
+
+.align 4
+2: /* Quickly pull a chunk of data into the working buffer.
+ */
+ vert8
+ vst1.s16 {q12}, [r12]!
+ vert8
+ vst1.s16 {q12}, [r12]!
+ tst r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+ bne 2b
+ cmp lr, #0
+ bne 3f
+4: /* if we end with 0 pixels left we'll have nothing handy to spread
+ * across to the right, so we rewind a bit.
+ */
+ mov lr, #1
+ sub r4, r4, #COMPONENT_COUNT
+ sub r5, r5, #COMPONENT_COUNT
+ sub r6, r6, #COMPONENT_COUNT
+ sub r7, r7, #COMPONENT_COUNT
+3: str lr, [sp,#AVAIL_STORE] /* done with available pixel count */
+ add lr, sp, #OSC_STORE
+ ldrd r8,r9, [lr,#0] /* need osc, osc_step soon */
+ ldrd r10,r11, [lr,#OSCCTL_STORE-OSC_STORE] /* need osc_ctl too */
+
+ /* copy four taps (width of cubic window) to far end for overflow
+ * address handling
+ */
+ sub lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
+ eor r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+ vld1.s16 {d28}, [lr]
+.elseif \comp == 2
+ vld1.s16 {q14}, [lr]
+.elseif \comp == 4
+ vld1.s16 {q14,q15}, [lr]
+.endif
+ add lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+ vst1.s16 {d28}, [lr]
+.elseif \comp == 2
+ vst1.s16 {q14}, [lr]
+.elseif \comp == 4
+ vst1.s16 {q14,q15}, [lr]
+.endif
+ /* r11 contains the maximum possible iteration count, but if r8 is
+ * greater than r10 then this indicates that the count must be
+ * reduced by one for this iteration to avoid reading past the end
+ * of the available data.
+ */
+ cmp r10, r8
+ sbc lr, r11, #0
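+ /* lr = r11 - (r8 > r10 ? 1 : 0): the borrow from the cmp above supplies the conditional -1 */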
+
+ mla r8, lr, r9, r8
+ sub r8, r8, #(CHUNKSIZE << 16)
+
+ str r8, [sp,#OSC_STORE] /* done with osc */
+
+ /* prefer to count pixels, rather than vectors, to clarify the tail
+ * store case on exit.
+ */
+ mov lr, lr, LSL #VECSHIFT
+ cmp lr, r1
+ movgt lr, r1
+
+ sub r1, r1, lr
+
+ mov lr, lr, LSL #COMPONENT_SHIFT
+
+ vmov.i16 d10, #3
+ vmov.i16 d11, #0x8000
+
+ cmp lr, #0
+ bgt 3f
+ cmp r1, #0
+ bgt 1b /* an extreme case where we shouldn't use code in this structure */
+ b 9f
+
+ .align 4
+2: /* Inner loop continues here, but starts at 3:, see end of loop
+ * below for explanation. */
+.if LOOP_OUTPUT_SIZE == 4
+ vst1.u32 {d16[0]}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 8
+ vst1.u8 {d16}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 16
+ vst1.u8 {q8}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 32
+ vst1.u8 {q8,q9}, [r0]!
+.endif
+ /* Inner loop: here the four x coefficients for each tap are
+ * calculated in vector code, and the addresses are calculated in
+ * scalar code, and these calculations are interleaved.
+ */
+3: vshr.u16 q8, q6, #1
+ mov r8, r2, LSR #(31 - CHUNKSHIFT)
+ vqrdmulh.s16 q9, q8, q8
+ add r2, r2, r3
+ vqrdmulh.s16 q10, q9, q8
+ mov r9, r2, LSR #(31 - CHUNKSHIFT)
+ vshll.s16 q11, d18, #2
+ vshll.s16 q12, d19, #2
+ add r2, r2, r3
+ vmlsl.s16 q11, d20, d10
+ vmlsl.s16 q12, d21, d10
+ mov r10, r2, LSR #(31 - CHUNKSHIFT)
+
+ vhadd.s16 q0, q10, q8
+ add r2, r2, r3
+ vsub.s16 q0, q9, q0
+ mov r11, r2, LSR #(31 - CHUNKSHIFT)
+
+ vaddw.s16 q1, q11, d18
+ vaddw.s16 q13, q12, d19
+ add r2, r2, r3
+ vshrn.s32 d2, q1, #1
+ vshrn.s32 d3, q13, #1
+ add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
+ vsub.s16 d2, d2, d11
+ vsub.s16 d3, d3, d11 // TODO: find a wider d11 and use q-reg operation
+ add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
+
+ vaddw.s16 q2, q11, d16
+ vaddw.s16 q13, q12, d17
+ add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
+ vshrn.s32 d4, q2, #1
+ vshrn.s32 d5, q13, #1
+ add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
+ vneg.s16 q2, q2
+
+ vhsub.s16 q3, q10, q9
+
+ /* increment the x fractional parts (overflow is ignored, as the
+ * scalar arithmetic shadows this addition with full precision).
+ */
+ vadd.s16 q6, q6, q7
+
+ /* At this point we have four pointers in r8-r11, pointing to the
+ * four taps in the scratch buffer that must be convolved together
+ * to produce an output pixel (one output pixel per pointer).
+ * These pointers usually overlap, but their spacing is irregular
+ * so resolving the redundancy through L1 is a pragmatic solution.
+ *
+ * The scratch buffer is made of signed 16-bit data, holding over
+ * some extra precision, and overshoot, from the vertical pass.
+ *
+ * We also have the 16-bit unsigned fixed-point weights for each
+ * of the four taps in q0 - q3. That's eight pixels worth of
+ * coefficients when we have only four pointers, so calculations
+ * for four more pixels are interleaved with the fetch and permute
+ * code for each variant in the following code.
+ *
+ * The data arrangement is less than ideal for any pixel format,
+ * but permuting loads help to mitigate most of the problems.
+ *
+ * Note also that the two outside taps of a bicubic are negative,
+ * but these coefficients are unsigned. The sign is hard-coded by
+ * use of multiply-and-subtract operations.
+ */
+.if \comp == 1
+ /* The uchar1 case.
+ * Issue one lanewise vld4.s16 to load four consecutive pixels from
+ * one pointer (one pixel) into four different registers; then load
+ * four consecutive s16 values from the next pointer (pixel) into
+ * the next lane of those four registers, etc., so that we finish
+ * with q12 - q15 representing the four taps, and each lane
+ * representing a separate pixel.
+ *
+ * The first vld4 uses a splat to avoid any false dependency on
+ * the previous state of the register.
+ */
+ vld4.s16 {d24[],d26[],d28[],d30[]}, [r8]
+ mov r8, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+ vld4.s16 {d24[1],d26[1],d28[1],d30[1]}, [r9]
+ add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
+ mov r9, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+ vld4.s16 {d24[2],d26[2],d28[2],d30[2]}, [r10]
+ add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
+ mov r10, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+ vld4.s16 {d24[3],d26[3],d28[3],d30[3]}, [r11]
+ add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
+ mov r11, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+ vld4.s16 {d25[],d27[],d29[],d31[]}, [r8]
+ add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
+ vld4.s16 {d25[1],d27[1],d29[1],d31[1]}, [r9]
+ vld4.s16 {d25[2],d27[2],d29[2],d31[2]}, [r10]
+ vld4.s16 {d25[3],d27[3],d29[3],d31[3]}, [r11]
+
+ vmull.s16 q8, d24, d0
+ vmull.s16 q9, d25, d1
+ vmlsl.s16 q8, d26, d2
+ vmlsl.s16 q9, d27, d3
+ vmlsl.s16 q8, d28, d4
+ vmlsl.s16 q9, d29, d5
+ vmlal.s16 q8, d30, d6
+ vmlal.s16 q9, d31, d7
+
+ subs lr, lr, #LOOP_OUTPUT_SIZE
+
+ vqrshrn.s32 d16, q8, #15
+ vqrshrn.s32 d17, q9, #15
+
+ vqrshrun.s16 d16, q8, #VERTBITS - 8
+.elseif \comp == 2
+ /* The uchar2 case:
+ * This time load pairs of values into adjacent lanes in q12 - q15
+ * by aliasing them as u32 data; leaving room for only four pixels,
+ * so the process has to be done twice. This also means that the
+ * coefficient registers fail to align with the coefficient data
+ * (eight separate pixels), so that has to be doubled-up to match.
+ */
+ vld4.u32 {d24[],d26[],d28[],d30[]}, [r8]
+ mov r8, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+ vld4.u32 {d24[1],d26[1],d28[1],d30[1]}, [r9]
+ add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
+ mov r9, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+ vld4.u32 {d25[],d27[],d29[],d31[]}, [r10]
+ add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
+ mov r10, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+ vld4.u32 {d25[1],d27[1],d29[1],d31[1]}, [r11]
+ add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
+ mov r11, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+
+ /* double-up coefficients to align with component pairs */
+ vmov d20, d0
+ add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
+ vmov d21, d2
+ vmov d22, d4
+ vmov d23, d6
+ vzip.s16 d0, d20
+ vzip.s16 d2, d21
+ vzip.s16 d4, d22
+ vzip.s16 d6, d23
+
+ vmull.s16 q8, d24, d0
+ vmull.s16 q9, d25, d20
+ vmlsl.s16 q8, d26, d2
+ vmlsl.s16 q9, d27, d21
+ vmlsl.s16 q8, d28, d4
+ vmlsl.s16 q9, d29, d22
+ vmlal.s16 q8, d30, d6
+ vmlal.s16 q9, d31, d23
+
+ vqrshrn.s32 d16, q8, #15
+ vqrshrn.s32 d17, q9, #15
+
+ vld4.u32 {d24[],d26[],d28[],d30[]}, [r8]
+ vld4.u32 {d24[1],d26[1],d28[1],d30[1]}, [r9]
+ vld4.u32 {d25[],d27[],d29[],d31[]}, [r10]
+ vld4.u32 {d25[1],d27[1],d29[1],d31[1]}, [r11]
+
+ /* double-up coefficients to align with component pairs */
+ vmov d0, d1
+ vmov d2, d3
+ vmov d4, d5
+ vmov d6, d7
+ vzip.s16 d0, d1
+ vzip.s16 d2, d3
+ vzip.s16 d4, d5
+ vzip.s16 d6, d7
+
+ vmull.s16 q10, d24, d0
+ vmull.s16 q11, d25, d1
+ vmlsl.s16 q10, d26, d2
+ vmlsl.s16 q11, d27, d3
+ vmlsl.s16 q10, d28, d4
+ vmlsl.s16 q11, d29, d5
+ vmlal.s16 q10, d30, d6
+ vmlal.s16 q11, d31, d7
+
+ subs lr, lr, #LOOP_OUTPUT_SIZE
+
+ vqrshrn.s32 d18, q10, #15
+ vqrshrn.s32 d19, q11, #15
+
+ vqrshrun.s16 d16, q8, #VERTBITS - 8
+ vqrshrun.s16 d17, q9, #VERTBITS - 8
+.elseif \comp == 4
+ /* The uchar4 case.
+ * This case is comparatively painless because four s16s are the
+ * smallest addressable unit for a vmul-by-scalar. Rather than
+ * permute the data, simply arrange the multiplies to suit the way
+ * the data comes in. That's a lot of data, though, so things
+ * progress in pairs of pixels at a time.
+ */
+ vld1.s16 {q12,q13}, [r8]
+ mov r8, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+ vld1.s16 {q14,q15}, [r9]
+ add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
+ mov r9, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+
+ vmull.s16 q8, d24, d0[0]
+ vmull.s16 q9, d28, d0[1]
+ vmlsl.s16 q8, d25, d2[0]
+ vmlsl.s16 q9, d29, d2[1]
+ vmlsl.s16 q8, d26, d4[0]
+ vmlsl.s16 q9, d30, d4[1]
+ vmlal.s16 q8, d27, d6[0]
+ vmlal.s16 q9, d31, d6[1]
+
+ /* And two more... */
+ vld1.s16 {q12,q13}, [r10]
+ add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
+ mov r10, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+ vld1.s16 {q14,q15}, [r11]
+ add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
+ mov r11, r2, LSR #(31 - CHUNKSHIFT)
+ add r2, r2, r3
+
+ vqrshrn.s32 d16, q8, #15
+ add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
+ vqrshrn.s32 d17, q9, #15
+
+ vmull.s16 q10, d24, d0[2]
+ vmull.s16 q11, d28, d0[3]
+ vmlsl.s16 q10, d25, d2[2]
+ vmlsl.s16 q11, d29, d2[3]
+ vmlsl.s16 q10, d26, d4[2]
+ vmlsl.s16 q11, d30, d4[3]
+ vmlal.s16 q10, d27, d6[2]
+ vmlal.s16 q11, d31, d6[3]
+
+ vqrshrn.s32 d18, q10, #15
+ vqrshrn.s32 d19, q11, #15
+
+ vqrshrun.s16 d16, q8, #VERTBITS - 8
+ vqrshrun.s16 d17, q9, #VERTBITS - 8
+
+ /* And two more... */
+ vld1.s16 {q12,q13}, [r8]
+ vld1.s16 {q14,q15}, [r9]
+
+ vmull.s16 q10, d24, d1[0]
+ vmull.s16 q11, d28, d1[1]
+ vmlsl.s16 q10, d25, d3[0]
+ vmlsl.s16 q11, d29, d3[1]
+ vmlsl.s16 q10, d26, d5[0]
+ vmlsl.s16 q11, d30, d5[1]
+ vmlal.s16 q10, d27, d7[0]
+ vmlal.s16 q11, d31, d7[1]
+
+ /* And two more... */
+ vld1.s16 {q12,q13}, [r10]
+ vld1.s16 {q14,q15}, [r11]
+
+ subs lr, lr, #LOOP_OUTPUT_SIZE
+
+ vqrshrn.s32 d18, q10, #15
+ vqrshrn.s32 d19, q11, #15
+
+ vmull.s16 q10, d24, d1[2]
+ vmull.s16 q11, d28, d1[3]
+ vmlsl.s16 q10, d25, d3[2]
+ vmlsl.s16 q11, d29, d3[3]
+ vmlsl.s16 q10, d26, d5[2]
+ vmlsl.s16 q11, d30, d5[3]
+ vmlal.s16 q10, d27, d7[2]
+ vmlal.s16 q11, d31, d7[3]
+
+ vqrshrn.s32 d20, q10, #15
+ vqrshrn.s32 d21, q11, #15
+
+ vqrshrun.s16 d18, q9, #VERTBITS - 8
+ vqrshrun.s16 d19, q10, #VERTBITS - 8
+.endif
+ bgt 2b /* continue inner loop */
+ /* The inner loop has already been limited to ensure that none of
+ * the earlier iterations could overfill the output, so the store
+ * appears within the loop but after the conditional branch (at the
+ * top). At the end, provided it won't overfill, perform the final
+ * store here. If it would, then break out to the tricky tail case
+ * instead.
+ */
+ blt 1f
+ /* Store the amount of data appropriate to the configuration of the
+ * instance being assembled.
+ */
+.if LOOP_OUTPUT_SIZE == 4
+ vst1.u32 {d16[0]}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 8
+ vst1.u8 {d16}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 16
+ vst1.u8 {q8}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 32
+ vst1.u8 {q8,q9}, [r0]!
+.endif
+ b 1b /* resume outer loop */
+ /* Partial tail store case:
+ * Different versions of the code need different subsets of the
+ * following partial stores. Here the number of components and the
+ * size of the chunk of data produced by each inner loop iteration
+ * is tested to figure out whether or not each phrase is relevant.
+ */
+.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
+1: tst lr, #16
+ beq 1f
+ vst1.u8 {q8}, [r0]!
+ vmov q8, q9
+.endif
+.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
+1: tst lr, #8
+ beq 1f
+ vst1.u8 {d16}, [r0]!
+ vmov.u8 d16, d17
+.endif
+.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
+1: tst lr, #4
+ beq 1f
+ vst1.u32 {d16[0]}, [r0]!
+ vext.u32 d16, d16, d16, #1
+.endif
+.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
+1: tst lr, #2
+ beq 1f
+ vst1.u16 {d16[0]}, [r0]!
+ vext.u16 d16, d16, d16, #1
+.endif
+.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
+1: tst lr, #1
+ beq 1f
+ vst1.u8 {d16[0]}, [r0]!
+.endif
+1:
+9: ldr sp, [sp,#SP_STORE]
+ vpop {d8-d15}
+ pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+END(rsdIntrinsicResizeB\comp\()_K)
+.endr
diff --git a/toolkit/TaskProcessor.cpp b/toolkit/TaskProcessor.cpp
new file mode 100644
index 0000000..d9ae83c
--- /dev/null
+++ b/toolkit/TaskProcessor.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TaskProcessor.h"
+
+#include <assert.h>
+#include <sys/prctl.h>
+
+#include "RenderScriptToolkit.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.TaskProcessor"
+
+namespace android {
+namespace renderscript {
+
+int Task::setTiling(unsigned int targetTileSizeInBytes) {
+ // Empirically, values smaller than 1000 are unlikely to give good performance.
+ targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes);
+ const size_t cellSizeInBytes =
+ mVectorSize; // If we add float support, vectorSize * 4 for that.
+ const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes;
+ assert(targetCellsPerTile > 0);
+
+ size_t cellsToProcessY;
+ size_t cellsToProcessX;
+ if (mRestriction == nullptr) {
+ cellsToProcessX = mSizeX;
+ cellsToProcessY = mSizeY;
+ } else {
+ assert(mRestriction->endX > mRestriction->startX);
+ assert(mRestriction->endY > mRestriction->startY);
+ cellsToProcessX = mRestriction->endX - mRestriction->startX;
+ cellsToProcessY = mRestriction->endY - mRestriction->startY;
+ }
+
+ // We want rows as large as possible, as the SIMD code we have is more efficient with
+ // large rows.
+ mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile);
+ // Once we know the number of tiles per row, we divide that row evenly. We round up to make
+ // sure all cells are included in the last tile of the row.
+ mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow);
+
+ // We do the same thing for the Y direction.
+ size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX);
+ mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile);
+ mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn);
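+ // Worked example (illustrative numbers): a 400x300 uchar4 image with a 16 KiB target gives
+ // targetCellsPerTile = 4096, mTilesPerRow = 1, mCellsPerTileX = 400, targetRowsPerTile = 11,
+ // mTilesPerColumn = 28, and mCellsPerTileY = 11, for a total of 28 tiles.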
+
+ return mTilesPerRow * mTilesPerColumn;
+}
+
+void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
+ // Figure out the overall boundaries.
+ size_t startWorkX;
+ size_t startWorkY;
+ size_t endWorkX;
+ size_t endWorkY;
+ if (mRestriction == nullptr) {
+ startWorkX = 0;
+ startWorkY = 0;
+ endWorkX = mSizeX;
+ endWorkY = mSizeY;
+ } else {
+ startWorkX = mRestriction->startX;
+ startWorkY = mRestriction->startY;
+ endWorkX = mRestriction->endX;
+ endWorkY = mRestriction->endY;
+ }
+ // Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify
+ // first the X, Y coordinate of our tile in that grid.
+ size_t tileIndexY = tileIndex / mTilesPerRow;
+ size_t tileIndexX = tileIndex % mTilesPerRow;
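+ // For example (illustrative), with mTilesPerRow == 4, tileIndex 6 is the tile at x = 2, y = 1.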
+ // Calculate the starting and ending point of that tile.
+ size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX;
+ size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY;
+ size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX);
+ size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY);
+
+ // Call the derived class to do the specific work.
+ if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) {
+ // When the tile covers entire rows, we can take advantage of the fact that some ops are not 2D.
+ processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1);
+ } else {
+ processData(threadIndex, startCellX, startCellY, endCellX, endCellY);
+ }
+}
+
+TaskProcessor::TaskProcessor(unsigned int numThreads)
+ : mUsesSimd{cpuSupportsSimd()},
+ /* If the requested number of threads is 0, we'll decide based on the number of cores.
+ * Through empirical testing, we've found that using more than 6 threads does not help.
+ * There may be more optimal choices to make depending on the SoC but we'll stick to
+ * this simple heuristic for now.
+ *
+ * We'll re-use the thread that calls the processor's doTask method, so we'll spawn one
+ * fewer worker pool thread than the total number of threads.
+ */
+ mNumberOfPoolThreads{numThreads ? numThreads - 1
+ : std::min(6u, std::thread::hardware_concurrency() - 1)} {
+ for (size_t i = 0; i < mNumberOfPoolThreads; i++) {
+ mPoolThreads.emplace_back(
+ std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
+ }
+}
+
+TaskProcessor::~TaskProcessor() {
+ {
+ std::lock_guard<std::mutex> lock(mQueueMutex);
+ mStopThreads = true;
+ mWorkAvailableOrStop.notify_all();
+ }
+
+ for (auto& thread : mPoolThreads) {
+ thread.join();
+ }
+}
+
+void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
+ if (threadIndex != 0) {
+ // Set the name of the thread, except for thread 0, which is not part of the pool.
+ // PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
+ char name[16]{"RenderScToolkit"};
+ prctl(PR_SET_NAME, name, 0, 0, 0);
+ // ALOGI("Starting thread%d", threadIndex);
+ }
+
+ std::unique_lock<std::mutex> lock(mQueueMutex);
+ while (true) {
+ mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() REQUIRES(mQueueMutex) {
+ return mStopThreads || (mTilesNotYetStarted > 0) ||
+ (returnWhenNoWork && (mTilesNotYetStarted == 0));
+ });
+ // ALOGI("Woke thread%d", threadIndex);
+
+ // This ScopedLockAssertion helps the compiler, when it checks thread annotations,
+ // realize that we hold the lock. However, it's not completely true; we don't
+ // hold the lock while processing the tile.
+ // TODO Figure out how to fix that.
+ android::base::ScopedLockAssertion lockAssert(mQueueMutex);
+ if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) {
+ break;
+ }
+
+ while (mTilesNotYetStarted > 0 && !mStopThreads) {
+ // This picks the tiles in decreasing order but that does not matter.
+ int myTile = --mTilesNotYetStarted;
+ mTilesInProcess++;
+ lock.unlock();
+ {
+ // We won't be executing this code unless the main thread is
+ // holding the mTaskMutex lock, which guards mCurrentTask.
+ // The compiler can't figure this out.
+ android::base::ScopedLockAssertion lockAssert(mTaskMutex);
+ mCurrentTask->processTile(threadIndex, myTile);
+ }
+ lock.lock();
+ mTilesInProcess--;
+ if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
+ mWorkIsFinished.notify_one();
+ }
+ }
+ }
+ // if (threadIndex != 0) {
+ // ALOGI("Ending thread%d", threadIndex);
+ // }
+}
+
+void TaskProcessor::doTask(Task* task) {
+ std::lock_guard<std::mutex> lockGuard(mTaskMutex);
+ task->setUsesSimd(mUsesSimd);
+ mCurrentTask = task;
+ // Notify the thread pool of available work.
+ startWork(task);
+ // Start processing some of the tiles on the calling thread.
+ processTilesOfWork(0, true);
+ // Wait for all the pool workers to complete.
+ waitForPoolWorkersToComplete();
+ mCurrentTask = nullptr;
+}
+
+void TaskProcessor::startWork(Task* task) {
+ /**
+ * The size in bytes that we're hoping each tile will be. If this value is too small,
+ * we'll spend too much time in synchronization. If it's too large, some cores may be
+ * idle while others still have a lot of work to do. Ideally, it would depend on the
+ * device we're running. 16k is the same value used by RenderScript and seems reasonable
+ * from ad-hoc tests.
+ */
+ const size_t targetTileSize = 16 * 1024;
+
+ std::lock_guard<std::mutex> lock(mQueueMutex);
+ assert(mTilesInProcess == 0);
+ mTilesNotYetStarted = task->setTiling(targetTileSize);
+ mWorkAvailableOrStop.notify_all();
+}
+
+void TaskProcessor::waitForPoolWorkersToComplete() {
+ std::unique_lock<std::mutex> lock(mQueueMutex);
+ // The predicate, i.e. the lambda, will make sure that
+ // we terminate even if the main thread calls this after
+ // mWorkIsFinished is signaled.
+ mWorkIsFinished.wait(lock, [this]() REQUIRES(mQueueMutex) {
+ return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
+ });
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/TaskProcessor.h b/toolkit/TaskProcessor.h
new file mode 100644
index 0000000..4d274fa
--- /dev/null
+++ b/toolkit/TaskProcessor.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H
+#define ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H
+
+#include <android-base/thread_annotations.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+namespace android {
+namespace renderscript {
+
+/**
+ * Description of the data to be processed for one Toolkit method call, e.g. one blur or one
+ * blend operation.
+ *
+ * The data to be processed is a 2D array of cells. Each cell is a vector of 1 to 4 unsigned bytes.
+ * The most typical configuration is a 2D array of uchar4 used to represent RGBA images.
+ *
+ * This is a base class. There will be a subclass for each Toolkit op.
+ *
+ * Typical usage of a derived class would look like:
+ * BlurTask task(in, out, sizeX, sizeY, vectorSize, etc);
+ * processor->doTask(&task);
+ *
+ * The TaskProcessor should call setTiling() and setUsesSimd() once, before calling processTile().
+ * Other classes should not call setTiling(), setUsesSimd(), and processTile().
+ */
+class Task {
+ protected:
+ /**
+ * Number of cells in the X direction.
+ */
+ const size_t mSizeX;
+ /**
+ * Number of cells in the Y direction.
+ */
+ const size_t mSizeY;
+ /**
+ * Number of elements in a vector (cell). From 1-4.
+ */
+ const size_t mVectorSize;
+ /**
+ * Whether the task prefers the processData call to represent the work to be done as
+ * one line rather than a rectangle. This would be the case for work that doesn't involve
+ * vertical neighbors, e.g. blend or histogram. A task would prefer this to minimize the
+ * number of SIMD calls it makes, i.e. to have one call that covers all the rows.
+ *
+ * This setting will be used only when a tile covers the entire width of the data to be
+ * processed.
+ */
+ const bool mPrefersDataAsOneRow;
+ /**
+ * Whether the processor we're working on supports SIMD operations.
+ */
+ bool mUsesSimd = false;
+
+ private:
+ /**
+ * If not null, we'll process a subset of the whole 2D array. This specifies the restriction.
+ */
+ const struct Restriction* mRestriction;
+
+ /**
+ * We'll divide the work into rectangular tiles. See setTiling().
+ */
+
+ /**
+ * Size of a tile in the X direction, as a number of cells.
+ */
+ size_t mCellsPerTileX = 0;
+ /**
+ * Size of a tile in the Y direction, as a number of cells.
+ */
+ size_t mCellsPerTileY = 0;
+ /**
+ * Number of tiles per row of the restricted area we're working on.
+ */
+ size_t mTilesPerRow = 0;
+ /**
+ * Number of tiles per column of the restricted area we're working on.
+ */
+ size_t mTilesPerColumn = 0;
+
+ public:
+ /**
+ * Construct a task.
+ *
+ * sizeX and sizeY should be greater than 0. vectorSize should be between 1 and 4.
+ * The restriction should outlive this instance. The Toolkit validates the
+ * arguments so we won't do that again here.
+ */
+ Task(size_t sizeX, size_t sizeY, size_t vectorSize, bool prefersDataAsOneRow,
+ const Restriction* restriction)
+ : mSizeX{sizeX},
+ mSizeY{sizeY},
+ mVectorSize{vectorSize},
+ mPrefersDataAsOneRow{prefersDataAsOneRow},
+ mRestriction{restriction} {}
+ virtual ~Task() {}
+
+ void setUsesSimd(bool uses) { mUsesSimd = uses; }
+
+ /**
+ * Divide the work into a number of tiles that can be distributed to the various threads.
+ * A tile will be a rectangular region. To be robust, we'll want to handle regular cases
+ * like 400x300 but also unusual ones like 1x120000, 120000x1, 1x1.
+ *
+ * We have a target size for the tiles, which corresponds roughly to how much data a thread
+ * will want to process before checking for more work. If the target is set too low, we'll spend
+ * more time in synchronization. If it's too large, some cores may not be used as efficiently.
+ *
+ * This method returns the number of tiles.
+ *
+ * @param targetTileSizeInBytes Target size. Values less than 1000 will be treated as 1000.
+ */
+ int setTiling(unsigned int targetTileSizeInBytes);
+
+ /**
+ * This is called by the TaskProcessor to instruct the task to process a tile.
+ *
+ * @param threadIndex The index of the thread that's processing the tile.
+ * @param tileIndex The index of the tile to process.
+ */
+ void processTile(unsigned int threadIndex, size_t tileIndex);
+
+ private:
+ /**
+ * Call to the derived class to process the data bounded by the rectangle specified
+ * by (startX, startY) and (endX, endY). The end values are EXCLUDED. This rectangle
+ * will be contained within the restriction, if one is provided.
+ */
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) = 0;
+};
+
+/**
+ * There's one instance of the task processor for the Toolkit. This class owns the thread pool,
+ * and dispatches the tiles of work to the threads.
+ */
+class TaskProcessor {
+ /**
+ * Does this processor support SIMD-like instructions?
+ */
+ const bool mUsesSimd;
+ /**
+ * The number of separate threads we'll spawn. It's one less than the number of threads that
+ * do the work as the client thread that starts the work will also be used.
+ */
+ const unsigned int mNumberOfPoolThreads;
+ /**
+ * Ensures that only one task is done at a time.
+ */
+ std::mutex mTaskMutex;
+ /**
+ * Ensures consistent access to the shared queue state.
+ */
+ std::mutex mQueueMutex;
+ /**
+ * The thread pool workers.
+ */
+ std::vector<std::thread> mPoolThreads;
+ /**
+ * The task being processed, if any. We only do one task at a time. We could create a queue
+ * of tasks, but using mTaskMutex is sufficient for now.
+ */
+ Task* mCurrentTask GUARDED_BY(mTaskMutex) = nullptr;
+ /**
+ * Signals that the mPoolThreads should terminate.
+ */
+ bool mStopThreads GUARDED_BY(mQueueMutex) = false;
+ /**
+ * Signaled when work is available or the mPoolThreads need to shut down. mStopThreads is used
+ * to distinguish between the two.
+ */
+ std::condition_variable mWorkAvailableOrStop;
+ /**
+ * Signaled when the work for the task is finished.
+ */
+ std::condition_variable mWorkIsFinished;
+ /**
+ * A user task, e.g. a blend or a blur, is split into a number of tiles. When a thread starts
+ * working on a new tile, it uses this count to identify which tile to work on. The tile
+ * number is sufficient to determine the boundaries of the data to process.
+ *
+ * The number of tiles left to process.
+ */
+ int mTilesNotYetStarted GUARDED_BY(mQueueMutex) = 0;
+ /**
+ * The number of tiles currently being processed. Must not be greater than
+ * mNumberOfPoolThreads + 1.
+ */
+ int mTilesInProcess GUARDED_BY(mQueueMutex) = 0;
+
+ /**
+ * Determines how we'll tile the work and signals the thread pool of available work.
+ *
+ * @param task The task to be performed.
+ */
+ void startWork(Task* task) REQUIRES(mTaskMutex);
+
+ /**
+ * Tells the thread to start processing work off the queue.
+ *
+ * The flag is used to prevent the main thread from blocking forever when the work is
+ * so trivial that the worker threads complete it before the main thread calls this
+ * method.
+ *
+ * @param threadIndex The index number (0..mNumberOfPoolThreads) this thread will be referred to by.
+ * @param returnWhenNoWork If there's no work, return immediately.
+ */
+ void processTilesOfWork(int threadIndex, bool returnWhenNoWork);
+
+ /**
+ * Wait for the pool workers to complete the work on the current task.
+ */
+ void waitForPoolWorkersToComplete();
+
+ public:
+ /**
+ * Create the processor.
+ *
+ * @param numThreads The total number of threads to use. If 0, we'll decide based on the
+ * number of cores.
+ */
+ explicit TaskProcessor(unsigned int numThreads = 0);
+
+ ~TaskProcessor();
+
+ /**
+ * Do the specified task. Returns only after the task has been completed.
+ */
+ void doTask(Task* task);
+
+ /**
+ * Some Tasks need to allocate temporary storage for each worker thread.
+ * This provides the number of threads.
+ */
+ unsigned int getNumberOfThreads() const { return mNumberOfPoolThreads + 1; }
+};
+
+} // namespace renderscript
+} // namespace android
+
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H
diff --git a/toolkit/TestTaskProcessor.cpp b/toolkit/TestTaskProcessor.cpp
new file mode 100644
index 0000000..36a94f4
--- /dev/null
+++ b/toolkit/TestTaskProcessor.cpp
@@ -0,0 +1,105 @@
+#include <array>
+
+#include "TaskProcessor.h"
+
+/**
+ * Sets all entries of the buffer to a value that depends on its coordinate and a delta.
+ */
+class SimpleTask : public android::renderscript::Task {
+ uint8_t* mBuffer;
+ uint8_t mDelta;
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY);
+
+ public:
+ SimpleTask(uint8_t* buffer, size_t vectorSize, size_t sizeX, size_t sizeY, uint8_t delta)
+ : Task{sizeX, sizeY, vectorSize, false, nullptr}, mBuffer{buffer}, mDelta{delta} {}
+};
+
+/**
+ * Create a new value that's a function of the x, y coordinates and a delta.
+ */
+static uint8_t newValue(size_t x, size_t y, uint8_t delta) {
+ return (((x & 0xff) << 4) | (y & 0xff)) + delta;
+}
+
+void SimpleTask::processData(int /*threadIndex*/, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ for (size_t y = startY; y < endY; y++) {
+ for (size_t x = startX; x < endX; x++) {
+ size_t index = (y * mSizeX + x) * mVectorSize;
+ for (size_t i = 0; i < mVectorSize; i++) {
+ // Use add to make sure the operation is only done once. This assumes
+ // the buffer starts out zeroed.
+ mBuffer[index + i] += newValue(x, y, mDelta + i);
+ }
+ }
+ }
+}
+
+/**
+ * Returns true if all the entries of the vector are the expected value.
+ * Prints an error if not.
+ */
+bool verifyAllTheSame(const std::vector<uint8_t>& buffer, size_t vectorSize, size_t sizeX,
+ size_t sizeY, uint8_t delta) {
+ for (size_t y = 0; y < sizeY; y++) {
+ for (size_t x = 0; x < sizeX; x++) {
+ size_t index = (y * sizeX + x) * vectorSize;
+ for (size_t i = 0; i < vectorSize; i++) {
+ uint8_t expectedValue = newValue(x, y, delta + i);
+ if (buffer[index + i] != expectedValue) {
+ printf("Test Error at %zu, %zu. Expected %u found %u instead\n", x, y,
+ expectedValue, buffer[index + i]);
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+/**
+ * Create a buffer of the specified size, set each entry of that buffer
+ * to the specified value using TaskProcessor, and verify the results.
+ */
+void testOne(android::renderscript::TaskProcessor* processor, uint8_t delta, size_t vectorSize,
+ size_t sizeX, size_t sizeY) {
+ std::vector<uint8_t> buffer(sizeX * sizeY * vectorSize);
+
+ SimpleTask task{buffer.data(), vectorSize, sizeX, sizeY, delta};
+ processor->doTask(&task);
+
+ if (verifyAllTheSame(buffer, vectorSize, sizeX, sizeY, delta)) {
+ printf("Test %u: All good!\n", delta);
+ }
+}
+
+int main() {
+ std::vector<std::thread> testThreads;
+
+ // Test with multiple threads, to help find synchronization errors.
+ android::renderscript::TaskProcessor processorA(1);
+ android::renderscript::TaskProcessor processorB(4);
+ testThreads.emplace_back(testOne, &processorA, 1, 4, 30, 40);
+ testThreads.emplace_back(testOne, &processorB, 1, 4, 30, 40);
+ testThreads.emplace_back(testOne, &processorA, 2, 4, 800, 600);
+ testThreads.emplace_back(testOne, &processorB, 2, 4, 800, 600);
+ testThreads.emplace_back(testOne, &processorA, 3, 1, 123, 47);
+ testThreads.emplace_back(testOne, &processorB, 3, 1, 123, 47);
+ testThreads.emplace_back(testOne, &processorA, 5, 2, 5000, 8000);
+ testThreads.emplace_back(testOne, &processorB, 5, 2, 5000, 8000);
+ testThreads.emplace_back(testOne, &processorA, 6, 3, 26000, 1);
+ testThreads.emplace_back(testOne, &processorB, 6, 3, 26000, 1);
+ testThreads.emplace_back(testOne, &processorA, 7, 4, 1, 26000);
+ testThreads.emplace_back(testOne, &processorB, 7, 4, 1, 26000);
+ testThreads.emplace_back(testOne, &processorA, 8, 4, 1000, 1000);
+ testThreads.emplace_back(testOne, &processorB, 8, 4, 1000, 1000);
+ testThreads.emplace_back(testOne, &processorA, 9, 1, 1, 1);
+ testThreads.emplace_back(testOne, &processorB, 9, 1, 1, 1);
+
+ for (auto& thread : testThreads) {
+ thread.join();
+ }
+ return 0;
+}
diff --git a/toolkit/Utils.cpp b/toolkit/Utils.cpp
new file mode 100644
index 0000000..8ec9fbe
--- /dev/null
+++ b/toolkit/Utils.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Utils.h"
+
+#include <cpu-features.h>
+
+#include "RenderScriptToolkit.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Utils"
+
+bool cpuSupportsSimd() {
+ AndroidCpuFamily family = android_getCpuFamily();
+ uint64_t features = android_getCpuFeatures();
+
+ if (family == ANDROID_CPU_FAMILY_ARM && (features & ANDROID_CPU_ARM_FEATURE_NEON)) {
+ // ALOGI("Arm with Neon");
+ return true;
+ } else if (family == ANDROID_CPU_FAMILY_ARM64 && (features & ANDROID_CPU_ARM64_FEATURE_ASIMD)) {
+ // ALOGI("Arm64 with ASIMD");
+ return true;
+ } else if ((family == ANDROID_CPU_FAMILY_X86 || family == ANDROID_CPU_FAMILY_X86_64) &&
+ (features & ANDROID_CPU_X86_FEATURE_SSSE3)) {
+ // ALOGI("x86* with SSE3");
+ return true;
+ }
+ // ALOGI("Not simd");
+ return false;
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+bool validRestriction(const char* tag, size_t sizeX, size_t sizeY, const Restriction* restriction) {
+ if (restriction == nullptr) {
+ return true;
+ }
+ if (restriction->startX >= sizeX || restriction->endX > sizeX) {
+ ALOGE("%s. sizeX should be greater than restriction->startX and greater or equal to "
+ "restriction->endX. %zu, %zu, and %zu were provided respectively.",
+ tag, sizeX, restriction->startX, restriction->endY);
+ return false;
+ }
+ if (restriction->startY >= sizeY && restriction->endY > sizeY) {
+ ALOGE("%s. sizeY should be greater than restriction->startY and greater or equal to "
+ "restriction->endY. %zu, %zu, and %zu were provided respectively.",
+ tag, sizeY, restriction->startY, restriction->endY);
+ return false;
+ }
+ if (restriction->startX >= restriction->endX) {
+ ALOGE("%s. Restriction startX should be less than endX. "
+ "%zu and %zu were provided respectively.",
+ tag, restriction->startX, restriction->endX);
+ return false;
+ }
+ if (restriction->startY >= restriction->endY) {
+ ALOGE("%s. Restriction startY should be less than endY. "
+ "%zu and %zu were provided respectively.",
+ tag, restriction->startY, restriction->endY);
+ return false;
+ }
+ return true;
+}
+#endif
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/Utils.h b/toolkit/Utils.h
new file mode 100644
index 0000000..ff9eb43
--- /dev/null
+++ b/toolkit/Utils.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H
+#define ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H
+
+#include <android/log.h>
+
+namespace android {
+namespace renderscript {
+
+/* The Toolkit does not support floating point buffers but the original RenderScript Intrinsics
+ * did for some operations. That code was preserved and protected by
+ * ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT.
+ */
+// TODO: On final packaging, decide whether this should be defined in the build file, and for which
+// config. #define ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+/* If we release the Toolkit as a C++ API, we'll want to enable validation at the C++ level
+ * by uncommenting this define.
+ *
+ * If we only have a Java/Kotlin API, the Kotlin layer does validation. We don't need to duplicate
+ * this effort.
+ */
+#define ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+
+#define ALOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
+#define ALOGW(...) __android_log_print(ANDROID_LOG_WARN, LOG_TAG, __VA_ARGS__)
+#define ALOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
+
+using uchar = unsigned char;
+using uint = unsigned int;
+using ushort = unsigned short;
+
+using uint8_t = uchar;
+using uint16_t = ushort;
+using uint32_t = uint;
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+
+template <typename TO, typename TI>
+inline TO convert(TI i) {
+ // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0);
+ // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255);
+ return __builtin_convertvector(i, TO);
+}
+
+template <>
+inline uchar convert(float i) {
+ // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0);
+ // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255);
+ return (uchar)i;
+}
+
+template <>
+inline float convert(uchar i) {
+ // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0);
+ // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255);
+ return (float)i;
+}
+
+inline int4 clamp(int4 amount, int low, int high) {
+ int4 r;
+ r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+ r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+ r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+ r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
+ return r;
+}
+
+inline float4 clamp(float4 amount, float low, float high) {
+ float4 r;
+ r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+ r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+ r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+ r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
+ return r;
+}
+
+inline int2 clamp(int2 amount, int low, int high) {
+ int2 r;
+ r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+ r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+ return r;
+}
+
+inline float2 clamp(float2 amount, float low, float high) {
+ float2 r;
+ r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+ r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+ return r;
+}
+
+inline int clamp(int amount, int low, int high) {
+ return amount < low ? low : (amount > high ? high : amount);
+}
+
+inline float clamp(float amount, float low, float high) {
+ return amount < low ? low : (amount > high ? high : amount);
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+struct Restriction;
+
+bool validRestriction(const char* tag, size_t sizeX, size_t sizeY, const Restriction* restriction);
+#endif
+
+/**
+ * Returns true if the processor we're running on supports the SIMD instructions that are
+ * used in our assembly code.
+ */
+bool cpuSupportsSimd();
+
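+// Integer division rounding up, e.g. divideRoundingUp(300, 11) == 28.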
+inline size_t divideRoundingUp(size_t a, size_t b) {
+ return a / b + (a % b == 0 ? 0 : 1);
+}
+
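+// A size of 3 is padded to 4, matching the memory layout RenderScript uses for 3-element vectors.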
+inline size_t paddedSize(size_t size) {
+ return size == 3 ? 4 : size;
+}
+
+} // namespace renderscript
+} // namespace android
+
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H
diff --git a/toolkit/YuvToRgb.cpp b/toolkit/YuvToRgb.cpp
new file mode 100644
index 0000000..2da0f5c
--- /dev/null
+++ b/toolkit/YuvToRgb.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.YuvToRgb"
+
+namespace android {
+namespace renderscript {
+
+inline size_t roundUpTo16(size_t val) {
+ return (val + 15) & ~15;
+}
+
+class YuvToRgbTask : public Task {
+ uchar4* mOut;
+ size_t mCstep;
+ size_t mStrideY;
+ size_t mStrideU;
+ size_t mStrideV;
+ const uchar* mInY;
+ const uchar* mInU;
+ const uchar* mInV;
+
+ void kernel(uchar4* out, uint32_t xstart, uint32_t xend, uint32_t currentY);
+ // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+ virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+ size_t endY) override;
+
+ public:
+ YuvToRgbTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
+ RenderScriptToolkit::YuvFormat format)
+ : Task{sizeX, sizeY, 4, false, nullptr}, mOut{reinterpret_cast<uchar4*>(output)} {
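+ // Derive plane pointers and strides from the packed input buffer. NV21 is a full-size
+ // Y plane followed by interleaved V/U bytes (hence a chroma step of 2); YV12 is a Y plane
+ // whose stride is rounded up to a multiple of 16, followed by two half-height chroma planes.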
+ switch (format) {
+ case RenderScriptToolkit::YuvFormat::NV21:
+ mCstep = 2;
+ mStrideY = sizeX;
+ mStrideU = mStrideY;
+ mStrideV = mStrideY;
+ mInY = reinterpret_cast<const uchar*>(input);
+ mInV = reinterpret_cast<const uchar*>(input + mStrideY * sizeY);
+ mInU = mInV + 1;
+ break;
+ case RenderScriptToolkit::YuvFormat::YV12:
+ mCstep = 1;
+ mStrideY = roundUpTo16(sizeX);
+ mStrideU = roundUpTo16(mStrideY >> 1);
+ mStrideV = mStrideU;
+ mInY = reinterpret_cast<const uchar*>(input);
+ mInU = reinterpret_cast<const uchar*>(input + mStrideY * sizeY);
+ mInV = mInU + mStrideV * sizeY / 2;
+ break;
+ }
+ }
+};
+
+void YuvToRgbTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+ size_t endY) {
+ for (size_t y = startY; y < endY; y++) {
+ size_t offset = mSizeX * y + startX;
+ uchar4* out = mOut + offset;
+ kernel(out, startX, endX, y);
+ }
+}
+
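+// Convert one YUV pixel to RGBA using integer arithmetic. The constants below are the usual
+// BT.601 video-range coefficients scaled by 256 (e.g. 1.164 * 256 ~= 298, 1.596 * 256 ~= 409),
+// so the final >> 8 removes the scaling. As a check, Y=16, U=128, V=128 yields (0, 0, 0, 255)
+// and Y=235, U=128, V=128 yields (255, 255, 255, 255).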
+static uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
+ int16_t Y = ((int16_t)y) - 16;
+ int16_t U = ((int16_t)u) - 128;
+ int16_t V = ((int16_t)v) - 128;
+
+ short4 p;
+ p.x = (Y * 298 + V * 409 + 128) >> 8;
+ p.y = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
+ p.z = (Y * 298 + U * 516 + 128) >> 8;
+ p.w = 255;
+ if(p.x < 0) {
+ p.x = 0;
+ }
+ if(p.x > 255) {
+ p.x = 255;
+ }
+ if(p.y < 0) {
+ p.y = 0;
+ }
+ if(p.y > 255) {
+ p.y = 255;
+ }
+ if(p.z < 0) {
+ p.z = 0;
+ }
+ if(p.z > 255) {
+ p.z = 255;
+ }
+
+ return (uchar4){static_cast<uchar>(p.x), static_cast<uchar>(p.y),
+ static_cast<uchar>(p.z), static_cast<uchar>(p.w)};
+}
+
+extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart,
+ size_t xend);
+extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart,
+ size_t xend);
+extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v,
+ size_t xstart, size_t xend);
+
+void YuvToRgbTask::kernel(uchar4 *out, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+ //ALOGI("kernel out %p, xstart=%u, xend=%u, currentY=%u", out, xstart, xend, currentY);
+
+ const uchar *y = mInY + (currentY * mStrideY);
+ const uchar *v = mInV + ((currentY >> 1) * mStrideV);
+ const uchar *u = mInU + ((currentY >> 1) * mStrideU);
+
+ //ALOGI("pinY %p, pinV %p, pinU %p", pinY, pinV, pinU);
+
+ uint32_t x1 = xstart;
+ uint32_t x2 = xend;
+
+ /*
+ ALOGE("pinY, %p, Y, %p, currentY, %d, strideY, %zu", pinY, y, currentY, mStrideY);
+ ALOGE("pinU, %p, U, %p, currentY, %d, strideU, %zu", pinU, u, currentY, mStrideU);
+ ALOGE("pinV, %p, V, %p, currentY, %d, strideV, %zu", pinV, v, currentY, mStrideV);
+ ALOGE("dimX, %d, dimY, %d", cp->alloc->mHal.drvState.lod[0].dimX,
+ cp->alloc->mHal.drvState.lod[0].dimY);
+ ALOGE("info->dim.x, %d, info->dim.y, %d", info->dim.x, info->dim.y);
+ uchar* pinY = (uchar*)mInY;
+ uchar* pinU = (uchar*)mInU;
+ uchar* pinV = (uchar*)mInV;
+ ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+ "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+ pinY, pinY[0], pinY[1], pinY[2], pinY[3], pinY[4], pinY[5], pinY[6], pinY[7], pinY[8],
+ pinY[9], pinY[10], pinY[11], pinY[12], pinY[13], pinY[14], pinY[15]);
+ ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+ "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+ pinY, pinY[16], pinY[17], pinY[18], pinY[19], pinY[20], pinY[21], pinY[22], pinY[23],
+ pinY[24], pinY[25], pinY[26], pinY[27], pinY[28], pinY[29], pinY[30], pinY[31]);
+ ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+ "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+ pinY, pinY[32], pinY[33], pinY[34], pinY[35], pinY[36], pinY[37], pinY[38], pinY[39],
+ pinY[40], pinY[41], pinY[42], pinY[43], pinY[44], pinY[45], pinY[46], pinY[47]);
+
+ ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+ "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+ pinU, pinU[0], pinU[1], pinU[2], pinU[3], pinU[4], pinU[5], pinU[6], pinU[7], pinU[8],
+ pinU[9], pinU[10], pinU[11], pinU[12], pinU[13], pinU[14], pinU[15]);
+ ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+ "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+ pinU, pinU[16], pinU[17], pinU[18], pinU[19], pinU[20], pinU[21], pinU[22], pinU[23],
+ pinU[24], pinU[25], pinU[26], pinU[27], pinU[28], pinU[29], pinU[30], pinU[31]);
+ ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+ "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+ pinU, pinU[32], pinU[33], pinU[34], pinU[35], pinU[36], pinU[37], pinU[38], pinU[39],
+ pinU[40], pinU[41], pinU[42], pinU[43], pinU[44], pinU[45], pinU[46], pinU[47]);
+
+ ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+ "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+ pinV, pinV[0], pinV[1], pinV[2], pinV[3], pinV[4], pinV[5], pinV[6], pinV[7], pinV[8],
+ pinV[9], pinV[10], pinV[11], pinV[12], pinV[13], pinV[14], pinV[15]);
+ ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+ "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+ pinV, pinV[16], pinV[17], pinV[18], pinV[19], pinV[20], pinV[21], pinV[22], pinV[23],
+ pinV[24], pinV[25], pinV[26], pinV[27], pinV[28], pinV[29], pinV[30], pinV[31]);
+ ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+ "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+ pinV, pinV[32], pinV[33], pinV[34], pinV[35], pinV[36], pinV[37], pinV[38], pinV[39],
+ pinV[40], pinV[41], pinV[42], pinV[43], pinV[44], pinV[45], pinV[46], pinV[47]);
+ */
+
+ /* If we start on an odd pixel then deal with it here and bump things along
+ * so that subsequent code can carry on with even-odd pairing assumptions.
+ */
+ if((x1 & 1) && (x2 > x1)) {
+ int cx = (x1 >> 1) * mCstep;
+ *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
+ out++;
+ x1++;
+ }
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+ if((x2 > x1) && mUsesSimd) {
+ int32_t len = x2 - x1;
+ if (mCstep == 1) {
+ rsdIntrinsicYuv2_K(out, y, u, v, x1, x2);
+ x1 += len;
+ out += len;
+ } else if (mCstep == 2) {
+ // Check for proper interleave
+ intptr_t ipu = (intptr_t)u;
+ intptr_t ipv = (intptr_t)v;
+
+ if (ipu == (ipv + 1)) {
+ rsdIntrinsicYuv_K(out, y, v, x1, x2);
+ x1 += len;
+ out += len;
+ } else if (ipu == (ipv - 1)) {
+ rsdIntrinsicYuvR_K(out, y, u, x1, x2);
+ x1 += len;
+ out += len;
+ }
+ }
+ }
+#endif
+
+ if(x2 > x1) {
+ // ALOGE("y %i %i %i", currentY, x1, x2);
+ while(x1 < x2) {
+ int cx = (x1 >> 1) * mCstep;
+ *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
+ out++;
+ x1++;
+ *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
+ out++;
+ x1++;
+ }
+ }
+}
+
+void RenderScriptToolkit::yuvToRgb(const uint8_t* input, uint8_t* output, size_t sizeX,
+ size_t sizeY, YuvFormat format) {
+ YuvToRgbTask task(input, output, sizeX, sizeY, format);
+ processor->doTask(&task);
+}
+
+} // namespace renderscript
+} // namespace android
diff --git a/toolkit/YuvToRgb_advsimd.S b/toolkit/YuvToRgb_advsimd.S
new file mode 100644
index 0000000..bb4b7ae
--- /dev/null
+++ b/toolkit/YuvToRgb_advsimd.S
@@ -0,0 +1,377 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register. This macro will be called from within several different wrapper
+ * variants for different data layouts. Y data starts with the even and odd
+ * bytes split into the low parts of v8 and v9 respectively. U and V are in
+ * v10 and v11. Working constants are pre-loaded into v24-v31, and v3 and v7
+ * are pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
+.macro yuvkern, regu=v10, regv=v11
+ /* v0 out R_lo / even R_lo accumulator
+ * v1 out G_lo / even G_lo accumulator
+ * v2 out B_lo / even B_lo accumulator
+ * v3 out A_lo / const 0xff*ff
+ * v4 out R_hi / even R_hi accumulator
+ * v5 out G_hi / even G_hi accumulator
+ * v6 out B_hi / even B_hi accumulator
+ * v7 out A_hi / const 0xff*ff
+ * v8 even Y / G_lo luma tmp
+ * v9 odd Y / G_lo luma tmp
+ * \regu in U
+ * \regv in V
+ * v12 R_lo luma tmp
+ * v13 B_lo luma tmp
+ * v14 R_hi luma tmp
+ * v15 B_hi luma tmp
+ * v16 odd R_lo accumulator
+ * v17 odd G_lo accumulator
+ * v18 odd B_lo accumulator
+ * v19 multiplier extra bits low
+ * v20 odd R_hi accumulator
+ * v21 odd G_hi accumulator
+ * v22 odd B_hi accumulator
+ * v23 multiplier extra bits high
+ * v24 constant 149
+ * v25 constant 50
+ * v26 constant 104
+ * v27 constant 204
+ * v28 constant 254
+ * v29 constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ * v30 constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ * v31 constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ */
+
+ umull v1.8h, v8.8b, v24.8b // g0 = y0 * 149
+ umull v17.8h, v9.8b, v24.8b // g1 = y1 * 149
+ umull2 v5.8h, v8.16b, v24.16b // g0_hi = y0_hi * 149
+ umull2 v21.8h, v9.16b, v24.16b // g1_hi = y1_hi * 149
+
+ umull v8.8h, \regu\().8b, v25.8b // g2 = u * 50 + v * 104
+ umlal v8.8h, \regv\().8b, v26.8b
+ umull2 v9.8h, \regu\().16b, v25.16b // g2_hi = u_hi * 50 + v_hi * 104
+ umlal2 v9.8h, \regv\().16b, v26.16b
+
+ ushr v19.16b, \regv\().16b, #1
+ uaddw v0.8h, v1.8h, v19.8b // r0 = g0 + (v >> 1)
+ uaddw v16.8h, v17.8h, v19.8b // r1 = g1 + (v >> 1)
+
+ uaddw2 v4.8h, v5.8h, v19.16b // r0_hi = g0_hi + (v_hi >> 1)
+ uaddw2 v20.8h, v21.8h, v19.16b // r1_hi = g1_hi + (v_hi >> 1)
+
+ ushll v19.8h, \regu\().8b, #2
+ ushll2 v23.8h, \regu\().16b, #2
+ add v2.8h, v1.8h, v19.8h // b0 = g0 + (u << 2)
+ add v18.8h, v17.8h, v19.8h // b1 = g1 + (u << 2)
+
+ add v6.8h, v5.8h, v23.8h // b0_hi = g0_hi + (u_hi << 2)
+ add v22.8h, v21.8h, v23.8h // b1_hi = g1_hi + (u_hi << 2)
+
+ umull v12.8h, \regv\().8b, v27.8b // r2 = v * 204
+ umull v13.8h, \regu\().8b, v28.8b // b2 = u * 254
+
+ umull2 v14.8h, \regv\().16b, v27.16b // r2_hi = v_hi * 204
+ umull2 v15.8h, \regu\().16b, v28.16b // b2_hi = u_hi * 254
+
+ uhadd v0.8h, v0.8h, v12.8h // r0 = (r0 + r2) >> 1
+ uhadd v16.8h, v16.8h, v12.8h // r1 = (r1 + r2) >> 1
+ uqadd v1.8h, v1.8h, v30.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uqadd v17.8h, v17.8h, v30.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uhadd v2.8h, v2.8h, v13.8h // b0 = (b0 + b2) >> 1
+ uhadd v18.8h, v18.8h, v13.8h // b1 = (b1 + b2) >> 1
+
+ uhadd v4.8h, v4.8h, v14.8h // r0_hi = (r0_hi + r2_hi) >> 1
+ uhadd v20.8h, v20.8h, v14.8h // r1_hi = (r1_hi + r2_hi) >> 1
+ uqadd v5.8h, v5.8h, v30.8h // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uqadd v21.8h, v21.8h, v30.8h // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uhadd v6.8h, v6.8h, v15.8h // b0_hi = (b0_hi + b2_hi) >> 1
+ uhadd v22.8h, v22.8h, v15.8h // b1_hi = (b1_hi + b2_hi) >> 1
+
+ uqsub v0.8h, v0.8h, v29.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v16.8h, v16.8h, v29.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2)
+ uqsub v17.8h, v17.8h, v8.8h // g1 = satu16(g1 - g2)
+ uqsub v2.8h, v2.8h, v31.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ uqsub v18.8h, v18.8h, v31.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+ uqsub v4.8h, v4.8h, v29.8h // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v20.8h, v20.8h, v29.8h // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v5.8h, v5.8h, v9.8h // g0_hi = satu16(g0_hi - g2_hi)
+ uqsub v21.8h, v21.8h, v9.8h // g1_hi = satu16(g1_hi - g2_hi)
+ uqsub v6.8h, v6.8h, v31.8h // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ uqsub v22.8h, v22.8h, v31.8h // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+ uqrshrn v0.8b, v0.8h, #6
+ uqrshrn v16.8b, v16.8h, #6
+ uqrshrn v1.8b, v1.8h, #7
+ uqrshrn v17.8b, v17.8h, #7
+ uqrshrn v2.8b, v2.8h, #6
+ uqrshrn v18.8b, v18.8h, #6
+
+ uqrshrn v4.8b, v4.8h, #6
+ uqrshrn v20.8b, v20.8h, #6
+ uqrshrn v5.8b, v5.8h, #7
+ uqrshrn v21.8b, v21.8h, #7
+ uqrshrn v6.8b, v6.8h, #6
+ uqrshrn v22.8b, v22.8h, #6
+
+ zip1 v0.16b, v0.16b, v16.16b
+ zip1 v1.16b, v1.16b, v17.16b
+ zip1 v2.16b, v2.16b, v18.16b
+
+ zip1 v4.16b, v4.16b, v20.16b
+ zip1 v5.16b, v5.16b, v21.16b
+ zip1 v6.16b, v6.16b, v22.16b
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+ movi v24.16b, #149
+ movi v25.16b, #50
+ movi v26.16b, #104
+ movi v27.16b, #204
+ movi v28.16b, #254
+ mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ dup v29.8h, w5
+ mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ dup v30.8h, w5
+ mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ dup v31.8h, w5
+
+ movi v3.16b, #0xff
+ movi v7.16b, #0xff
+
+ subs x2, x2, #32
+ bhs 1f
+ b 2f
+
+ .align 4
+1: ld2 {v8.16b,v9.16b}, [x1], #32
+ .if \interleaved
+ ld2 {v10.16b,v11.16b}, [x3], #32
+ .else
+ ld1 {v10.16b}, [x3], #16
+ ld1 {v11.16b}, [x4], #16
+ .endif
+
+ .if \swapuv
+ \kernel regu=v11, regv=v10
+ .else
+ \kernel
+ .endif
+
+ subs x2, x2, #32
+
+ st4 {v0.16b - v3.16b}, [x0], #64
+ st4 {v4.16b - v7.16b}, [x0], #64
+
+ bhs 1b
+
+2: adds x2, x2, #32
+ beq 2f
+
+ /* To handle the tail portion of the data (something less than 32
+ * bytes) load small power-of-two chunks into working registers. It
+ * doesn't matter where they end up in the register; the same process
+ * will store them back out using the same positions and the
+ * interaction between neighbouring pixels is constrained to odd
+ * boundaries where the load operations don't interfere.
+ */
+ movi v8.8b, #0
+ movi v9.8b, #0
+ movi v10.8b, #0
+ movi v11.8b, #0
+
+ tbz x2, #4, 1f
+ ld1 {v9.16b}, [x1], #16
+ .if \interleaved
+ ld1 {v11.16b}, [x3], #16
+ .else
+ ld1 {v10.d}[1], [x3], #8
+ ld1 {v11.d}[1], [x4], #8
+ .endif
+1: tbz x2, #3, 1f
+ ld1 {v8.d}[1], [x1], #8
+ .if \interleaved
+ ld1 {v10.d}[1], [x3], #8
+ .else
+ ld1 {v10.s}[1], [x3], #4
+ ld1 {v11.s}[1], [x4], #4
+ .endif
+1: tbz x2, #2, 1f
+ ld1 {v8.s}[1], [x1], #4
+ .if \interleaved
+ ld1 {v10.s}[1], [x3], #4
+ .else
+ ld1 {v10.h}[1], [x3], #2
+ ld1 {v11.h}[1], [x4], #2
+ .endif
+1: tbz x2, #1, 1f
+ ld1 {v8.h}[1], [x1], #2
+ .if \interleaved
+ ld1 {v10.h}[1], [x3], #2
+ .else
+ ld1 {v10.b}[1], [x3], #1
+ ld1 {v11.b}[1], [x4], #1
+ .endif
+1: tbz x2, #0, 1f
+ ld1 {v8.b}[1], [x1], #1
+ .if \interleaved
+ ld1 {v10.h}[0], [x3], #2
+ .else
+ ld1 {v10.b}[0], [x3], #1
+ ld1 {v11.b}[0], [x4], #1
+ .endif
+
+ /* One small impediment in the process above is that some of the load
+ * operations can't perform byte-wise structure deinterleaving at the
+ * same time as loading only part of a register. So the data is loaded
+ * linearly and unpacked manually at this point if necessary.
+ */
+1: mov v12.16b, v8.16b
+ uzp1 v8.16b, v12.16b, v9.16b
+ uzp2 v9.16b, v12.16b, v9.16b
+ .if \interleaved
+ mov v12.16b, v10.16b
+ uzp1 v10.16b, v12.16b, v11.16b
+ uzp2 v11.16b, v12.16b, v11.16b
+ .endif
+
+ .if \swapuv
+ \kernel regu=v11, regv=v10
+ .else
+ \kernel
+ .endif
+
+ /* As above but with the output; structured stores for partial vectors
+ * aren't available, so the data is re-packed first and stored linearly.
+ */
+ zip1 v16.16b, v0.16b, v2.16b
+ zip2 v18.16b, v0.16b, v2.16b
+ zip1 v17.16b, v1.16b, v3.16b
+ zip2 v19.16b, v1.16b, v3.16b
+ zip1 v0.16b, v16.16b, v17.16b
+ zip2 v1.16b, v16.16b, v17.16b
+ zip1 v2.16b, v18.16b, v19.16b
+ zip2 v3.16b, v18.16b, v19.16b
+
+ /* Luckily v4-v7 don't need to be unzipped because the complete set of
+ * four registers can be stored using st4. */
+
+ tbz x2, #4, 1f
+ st4 {v4.16b - v7.16b}, [x0], #64
+1: tbz x2, #3, 1f
+ st1 {v2.16b,v3.16b}, [x0], #32
+1: tbz x2, #2, 1f
+ st1 {v1.16b}, [x0], #16
+1: tbz x2, #1, 1f
+ st1 {v0.d}[1], [x0], #8
+1: tbz x2, #0, 2f
+ st1 {v0.s}[1], [x0], #4
+2:
+.endm
+
+
+/* void rsdIntrinsicYuv2_K(
+ * void *out, // x0
+ * void const *yin, // x1
+ * void const *uin, // x2
+ * void const *vin, // x3
+ * size_t xstart, // x4
+ * size_t xend); // x5
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+ lsr x6, x4, #1
+ add x0, x0, x4, LSL #2
+ add x1, x1, x4
+ add x4, x3, x6
+ add x3, x2, x6
+ sub x2, x5, x6, LSL #1
+
+ sub x6, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x6]
+
+ wrap_line yuvkern, 0
+
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicYuv2_K)
+
+/* void rsdIntrinsicYuv_K(
+ * void *out, // x0
+ * void const *yin, // x1
+ * void const *uvin, // x2
+ * size_t xstart, // x3
+ * size_t xend); // x4
+ */
+ENTRY(rsdIntrinsicYuv_K)
+ bic x5, x3, #1
+ add x0, x0, x5, LSL #2
+ add x1, x1, x5
+ add x3, x2, x5
+ sub x2, x4, x5
+
+ sub x5, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x5]
+
+ wrap_line yuvkern, 1, 1
+
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicYuv_K)
+
+/* void rsdIntrinsicYuvR_K(
+ * void *out, // x0
+ * void const *yin, // x1
+ * void const *uvin, // x2
+ * size_t xstart, // x3
+ * size_t xend); // x4
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+ bic x5, x3, #1
+ add x0, x0, x5, LSL #2
+ add x1, x1, x5
+ add x3, x2, x5
+ sub x2, x4, x5
+
+ sub x5, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x5]
+
+ wrap_line yuvkern, 1
+
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicYuvR_K)
diff --git a/toolkit/YuvToRgb_neon.S b/toolkit/YuvToRgb_neon.S
new file mode 100644
index 0000000..5c3bce4
--- /dev/null
+++ b/toolkit/YuvToRgb_neon.S
@@ -0,0 +1,298 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register. This macro will be called from within several different wrapper
+ * variants for different data layouts. Y data starts in q8, but with the even
+ * and odd bytes split into d16 and d17 respectively. U and V are in d20
+ * and d21. Working constants are pre-loaded into q13-q15, and q3 is
+ * pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
+.macro yuvkern
+ vmov.i8 d15, #149
+
+ vmull.u8 q1, d16, d15 // g0 = y0 * 149
+ vmull.u8 q5, d17, d15 // g1 = y1 * 149
+
+ vmov.i8 d14, #50
+ vmov.i8 d15, #104
+ vmull.u8 q8, d20, d14 // g2 = u * 50 + v * 104
+ vmlal.u8 q8, d21, d15
+
+ vshr.u8 d14, d21, #1
+ vaddw.u8 q0, q1, d14 // r0 = y0 * 149 + (v >> 1)
+ vaddw.u8 q4, q5, d14 // r1 = y1 * 149 + (v >> 1)
+
+ vshll.u8 q7, d20, #2
+ vadd.u16 q2, q1, q7 // b0 = y0 * 149 + (u << 2)
+ vadd.u16 q6, q5, q7 // b1 = y1 * 149 + (u << 2)
+
+ vmov.i8 d14, #204
+ vmov.i8 d15, #254
+ vmull.u8 q11, d21, d14 // r2 = v * 204
+ vmull.u8 q12, d20, d15 // b2 = u * 254
+
+ vhadd.u16 q0, q11 // r0 = (r0 + r2) >> 1
+ vhadd.u16 q4, q11 // r1 = (r1 + r2) >> 1
+ vqadd.u16 q1, q14 // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ vqadd.u16 q5, q14 // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ vhadd.u16 q2, q12 // b0 = (b0 + b2) >> 1
+ vhadd.u16 q6, q12 // b1 = (b1 + b2) >> 1
+
+ vqsub.u16 q0, q13 // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ vqsub.u16 q4, q13 // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ vqsub.u16 q1, q8 // g0 = satu16(g0 - g2)
+ vqsub.u16 q5, q8 // g1 = satu16(g1 - g2)
+ vqsub.u16 q2, q15 // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ vqsub.u16 q6, q15 // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+ vqrshrn.u16 d0, q0, #6
+ vqrshrn.u16 d1, q1, #7
+ vqrshrn.u16 d2, q4, #6
+ vqrshrn.u16 d3, q5, #7
+ vqrshrn.u16 d4, q2, #6
+ vqrshrn.u16 d5, q6, #6
+
+ vzip.u8 q0, q1
+ vzip.u8 d4, d5
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+
+ movw r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ vdup.i16 q13, r5
+ movw r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ vdup.i16 q14, r5
+ movw r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ vdup.i16 q15, r5
+
+ vmov.i8 q3, #0xff
+
+ subs r2, #16
+ bhs 1f
+ b 2f
+
+ .align 4
+1: vld2.u8 {d16,d17}, [r1]!
+ pld [r1, #256]
+ .if \interleaved
+ vld2.u8 {d20,d21}, [r3]!
+ .if \swapuv
+ vswp d20, d21
+ .endif
+ pld [r3, #256]
+ .else
+ vld1.u8 d20, [r3]!
+ vld1.u8 d21, [r4]!
+ pld [r3, #128]
+ pld [r4, #128]
+ .endif
+
+ \kernel
+
+ subs r2, #16
+
+ vst4.u8 {d0,d2,d4,d6}, [r0]!
+ vst4.u8 {d1,d3,d5,d7}, [r0]!
+
+ bhs 1b
+
+2: adds r2, #16
+ beq 2f
+
+ /* To handle the tail portion of the data (something less than 16
+ * bytes) load small power-of-two chunks into working registers. It
+ * doesn't matter where they end up in the register; the same process
+ * will store them back out using the same positions and the
+ * interaction between neighbouring pixels is constrained to odd
+ * boundaries where the load operations don't interfere.
+ */
+ vmov.i8 q8, #0
+ vmov.i8 q10, #0
+
+ tst r2, #8
+ beq 1f
+ vld1.u8 d17, [r1]!
+ .if \interleaved
+ vld1.u8 d21, [r3]!
+ .else
+ vld1.u32 d20[1], [r3]!
+ vld1.u32 d21[1], [r4]!
+ .endif
+
+1: tst r2, #4
+ beq 1f
+ vld1.u32 d16[1], [r1]!
+ .if \interleaved
+ vld1.u32 d20[1], [r3]!
+ .else
+ vld1.u16 d20[1], [r3]!
+ vld1.u16 d21[1], [r4]!
+ .endif
+1: tst r2, #2
+ beq 1f
+ vld1.u16 d16[1], [r1]!
+ .if \interleaved
+ vld1.u16 d20[1], [r3]!
+ .else
+ vld1.u8 d20[1], [r3]!
+ vld1.u8 d21[1], [r4]!
+ .endif
+1: tst r2, #1
+ beq 1f
+ vld1.u8 d16[1], [r1]!
+ .if \interleaved
+ vld1.u16 d20[0], [r3]!
+ .else
+ vld1.u8 d20[0], [r3]!
+ vld1.u8 d21[0], [r4]!
+ .endif
+
+ /* One small impediment in the process above is that some of the load
+ * operations can't perform byte-wise structure deinterleaving at the
+ * same time as loading only part of a register. So the data is loaded
+ * linearly and unpacked manually at this point if necessary.
+ */
+1: vuzp.8 d16, d17
+ .if \interleaved
+ vuzp.8 d20, d21
+ .if \swapuv
+ vswp d20, d21
+ .endif
+ .endif
+
+ \kernel
+
+ /* As above but with the output; structured stores for partial vectors
+ * aren't available, so the data is re-packed first and stored linearly.
+ */
+ vzip.8 q0, q2
+ vzip.8 q1, q3
+ vzip.8 q0, q1
+ vzip.8 q2, q3
+
+1: tst r2, #8
+ beq 1f
+ vst1.u8 {d4,d5,d6,d7}, [r0]!
+
+1: tst r2, #4
+ beq 1f
+ vst1.u8 {d2,d3}, [r0]!
+1: tst r2, #2
+ beq 1f
+ vst1.u8 d1, [r0]!
+1: tst r2, #1
+ beq 2f
+ vst1.u32 d0[1], [r0]!
+2:
+.endm
+
+
+/* void rsdIntrinsicYuv2_K(
+ * void *out, // r0
+ * void const *yin, // r1
+ * void const *uin, // r2
+ * void const *vin, // r3
+ * size_t xstart, // [sp]
+ * size_t xend); // [sp+#4]
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+ push {r4,r5}
+ ldr r5, [sp, #8]
+ mov r4, r3
+ mov r3, r2
+ ldr r2, [sp, #12]
+
+ add r0, r5, LSL #2
+ add r1, r5
+ add r3, r5, LSR #1
+ add r4, r5, LSR #1
+ sub r2, r5
+
+ vpush {d8-d15}
+
+ wrap_line yuvkern, 0
+
+ vpop {d8-d15}
+ pop {r4,r5}
+ bx lr
+END(rsdIntrinsicYuv2_K)
+
+/* void rsdIntrinsicYuv_K(
+ * void *out, // r0
+ * void const *yin, // r1
+ * void const *uvin, // r2
+ * size_t xstart, // r3
+ * size_t xend); // [sp]
+ */
+ENTRY(rsdIntrinsicYuv_K)
+ push {r4,r5}
+ bic r4, r3, #1
+ add r3, r2, r4
+ ldr r2, [sp, #8]
+
+ add r0, r4, LSL #2
+ add r1, r4
+ sub r2, r4
+
+ vpush {d8-d15}
+
+ wrap_line yuvkern, 1, 1
+
+ vpop {d8-d15}
+ pop {r4,r5}
+ bx lr
+END(rsdIntrinsicYuv_K)
+
+/* void rsdIntrinsicYuvR_K(
+ * void *out, // r0
+ * void const *yin, // r1
+ * void const *uvin, // r2
+ * size_t xstart, // r3
+ * size_t xend); // [sp]
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+ push {r4,r5}
+ bic r4, r3, #1
+ add r3, r2, r4
+ ldr r2, [sp, #8]
+
+ add r0, r4, LSL #2
+ add r1, r4
+ sub r2, r4
+
+ vpush {d8-d15}
+
+ wrap_line yuvkern, 1
+
+ vpop {d8-d15}
+ pop {r4,r5}
+ bx lr
+END(rsdIntrinsicYuvR_K)
diff --git a/toolkit/java/Toolkit.kt b/toolkit/java/Toolkit.kt
new file mode 100644
index 0000000..438f241
--- /dev/null
+++ b/toolkit/java/Toolkit.kt
@@ -0,0 +1,1566 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.android.renderscript
+
+import android.graphics.Bitmap
+import java.lang.IllegalArgumentException
+
+// This string is used for error messages.
+private const val externalName = "RenderScript Toolkit"
+
+/**
+ * A collection of high-performance graphic utility functions like blur and blend.
+ *
+ * This toolkit provides ten image manipulation functions: blend, blur, color matrix, convolve,
+ * histogram, histogramDot, lut, lut3d, resize, and YUV to RGB. These functions execute
+ * multithreaded on the CPU.
+ *
+ * Most of the functions have two variants: one that manipulates Bitmaps, the other ByteArrays.
+ * For ByteArrays, you need to specify the width and height of the data to be processed, as
+ * well as the number of bytes per pixel. For most use cases, this will be 4.
+ *
+ * The Toolkit creates a thread pool that's used for processing the functions. The threads live
+ * for the duration of the application. They can be destroyed by calling the method shutdown().
+ *
+ * This library is thread safe. You can call methods from different threads. The functions will
+ * execute sequentially.
+ *
+ * A native C++ version of this Toolkit is available. Check the RenderScriptToolkit.h file in the
+ * cpp directory.
+ *
+ * This toolkit can be used as a replacement for most RenderScript Intrinsic functions. Compared
+ * to RenderScript, it's simpler to use and more than twice as fast on the CPU. However, RenderScript
+ * Intrinsics allow more flexibility for the type of allocation supported. In particular, this
+ * toolkit does not support allocations of floats.
+ */
+object Toolkit {
+ /**
+ * Blends a source buffer with the destination buffer.
+ *
+ * Blends a source buffer and a destination buffer, placing the result in the destination
+ * buffer. The blending is done pairwise between two corresponding RGBA values found in
+ * each buffer. The mode parameter specifies one of fifteen supported blending operations.
+ * See {@link BlendingMode}.
+ *
+ * A variant of this method is also available to blend Bitmaps.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The source and destination buffers must have the same dimensions. Both arrays should have
+ * a size greater than or equal to sizeX * sizeY * 4. The buffers have a row-major layout.
+ *
+ * @param mode The specific blending operation to do.
+ * @param sourceArray The RGBA input buffer.
+ * @param destArray The destination buffer. Used for input and output.
+ * @param sizeX The width of both buffers, as a number of RGBA values.
+ * @param sizeY The height of both buffers, as a number of RGBA values.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ @JvmOverloads
+ fun blend(
+ mode: BlendingMode,
+ sourceArray: ByteArray,
+ destArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d? = null
+ ) {
+ require(sourceArray.size >= sizeX * sizeY * 4) {
+ "$externalName blend. sourceArray is too small for the given dimensions. " +
+ "$sizeX*$sizeY*4 < ${sourceArray.size}."
+ }
+ require(destArray.size >= sizeX * sizeY * 4) {
+ "$externalName blend. sourceArray is too small for the given dimensions. " +
+ "$sizeX*$sizeY*4 < ${sourceArray.size}."
+ }
+ validateRestriction("blend", sizeX, sizeY, restriction)
+
+ nativeBlend(nativeHandle, mode.value, sourceArray, destArray, sizeX, sizeY, restriction)
+ }
+
+ /**
+ * Blends a source bitmap with the destination bitmap.
+ *
+ * Blends a source bitmap and a destination bitmap, placing the result in the destination
+ * bitmap. The blending is done pairwise between two corresponding RGBA values found in
+ * each bitmap. The mode parameter specifies one of fifteen supported blending operations.
+ * See {@link BlendingMode}.
+ *
+ * A variant of this method is available to blend ByteArrays.
+ *
+ * The bitmaps should have identical width and height, and have a config of ARGB_8888.
+ * Bitmaps with a stride different than width * vectorSize are not currently supported.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each bitmap. If provided, the range must be wholly contained within the dimensions
+ * of the bitmap.
+ *
+ * @param mode The specific blending operation to do.
+ * @param sourceBitmap The RGBA input buffer.
+ * @param destBitmap The destination buffer. Used for input and output.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ */
+ @JvmOverloads
+ fun blend(
+ mode: BlendingMode,
+ sourceBitmap: Bitmap,
+ destBitmap: Bitmap,
+ restriction: Range2d? = null
+ ) {
+ validateBitmap("blend", sourceBitmap)
+ validateBitmap("blend", destBitmap)
+ require(
+ sourceBitmap.width == destBitmap.width &&
+ sourceBitmap.height == destBitmap.height
+ ) {
+ "$externalName blend. Source and destination bitmaps should be the same size. " +
+ "${sourceBitmap.width}x${sourceBitmap.height} and " +
+ "${destBitmap.width}x${destBitmap.height} provided."
+ }
+ require(sourceBitmap.config == destBitmap.config) {
+ "RenderScript Toolkit blend. Source and destination bitmaps should have the same " +
+ "config. ${sourceBitmap.config} and ${destBitmap.config} provided."
+ }
+ validateRestriction("blend", sourceBitmap.width, sourceBitmap.height, restriction)
+
+ nativeBlendBitmap(nativeHandle, mode.value, sourceBitmap, destBitmap, restriction)
+ }
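+
+ // A minimal usage sketch for the Bitmap variant above, assuming `source` and `dest` are
+ // same-sized ARGB_8888 Bitmaps and that BlendingMode defines a SRC_OVER entry:
+ //
+ //     Toolkit.blend(BlendingMode.SRC_OVER, source, dest)  // dest is modified in place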
+
+ /**
+ * Blurs an image.
+ *
+ * Performs a Gaussian blur of an image and returns result in a ByteArray buffer. A variant of
+ * this method is available to blur Bitmaps.
+ *
+ * The radius determines which pixels are used to compute each blurred pixel. This Toolkit
+ * accepts values between 1 and 25. Larger values create a more blurred effect but also
+ * take longer to compute. When the radius extends past the edge, the edge pixel will
+ * be used as replacement for the pixel that's out of bounds.
+ *
+ * Each input pixel can either be represented by four bytes (RGBA format) or one byte
+ * for the less common blurring of alpha channel only image.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY. NOTE: The output buffer will still be full size, with the
+ * section that's not blurred all set to 0. This is to stay compatible with RenderScript.
+ *
+ * The source buffer should be large enough for sizeX * sizeY * vectorSize bytes. It has a
+ * row-major layout.
+ *
+ * @param inputArray The buffer of the image to be blurred.
+ * @param vectorSize Either 1 or 4, the number of bytes in each cell, i.e. A vs. RGBA.
+ * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells.
+ * @param radius The radius of the pixels used to blur, a value from 1 to 25.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The blurred pixels, a ByteArray the same size as inputArray.
+ */
+ @JvmOverloads
+ fun blur(
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ radius: Int = 5,
+ restriction: Range2d? = null
+ ): ByteArray {
+ require(vectorSize == 1 || vectorSize == 4) {
+ "$externalName blur. The vectorSize should be 1 or 4. $vectorSize provided."
+ }
+ require(inputArray.size >= sizeX * sizeY * vectorSize) {
+ "$externalName blur. inputArray is too small for the given dimensions. " +
+ "$sizeX*$sizeY*$vectorSize < ${inputArray.size}."
+ }
+ require(radius in 1..25) {
+ "$externalName blur. The radius should be between 1 and 25. $radius provided."
+ }
+ validateRestriction("blur", sizeX, sizeY, restriction)
+
+ val outputArray = ByteArray(inputArray.size)
+ nativeBlur(
+ nativeHandle, inputArray, vectorSize, sizeX, sizeY, radius, outputArray, restriction
+ )
+ return outputArray
+ }
+
+ /**
+ * Blurs an image.
+ *
+ * Performs a Gaussian blur of a Bitmap and returns result as a Bitmap. A variant of
+ * this method is available to blur ByteArrays.
+ *
+ * The radius determines which pixels are used to compute each blurred pixel. This Toolkit
+ * accepts values between 1 and 25. Larger values create a more blurred effect but also
+ * take longer to compute. When the radius extends past the edge, the edge pixel will
+ * be used as replacement for the pixel that's out of bounds.
+ *
+ * This method supports input Bitmap of config ARGB_8888 and ALPHA_8. Bitmaps with a stride
+ * different than width * vectorSize are not currently supported. The returned Bitmap has the
+ * same config.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * of the bitmap. NOTE: The output Bitmap will still be full size, with the
+ * section that's not blurred all set to 0. This is to stay compatible with RenderScript.
+ *
+ * @param inputBitmap The buffer of the image to be blurred.
+ * @param radius The radius of the pixels used to blur, a value from 1 to 25. Default is 5.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The blurred Bitmap.
+ */
+ @JvmOverloads
+ fun blur(inputBitmap: Bitmap, radius: Int = 5, restriction: Range2d? = null): Bitmap {
+ validateBitmap("blur", inputBitmap)
+ require(radius in 1..25) {
+ "$externalName blur. The radius should be between 1 and 25. $radius provided."
+ }
+ validateRestriction("blur", inputBitmap.width, inputBitmap.height, restriction)
+
+ val outputBitmap = createCompatibleBitmap(inputBitmap)
+ nativeBlurBitmap(nativeHandle, inputBitmap, outputBitmap, radius, restriction)
+ return outputBitmap
+ }
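+
+ // A minimal usage sketch, assuming `bitmap` is an ARGB_8888 Bitmap:
+ //
+ //     val blurred = Toolkit.blur(bitmap, radius = 10)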
+
+ /**
+ * Identity matrix that can be passed to the {@link RenderScriptToolkit::colorMatrix} method.
+ *
+ * Using this matrix will result in no change to the pixel through multiplication although
+ * the pixel value can still be modified by the add vector, or transformed to a different
+ * format.
+ */
+ val identityMatrix: FloatArray
+ get() = floatArrayOf(
+ 1f, 0f, 0f, 0f,
+ 0f, 1f, 0f, 0f,
+ 0f, 0f, 1f, 0f,
+ 0f, 0f, 0f, 1f
+ )
+
+ /**
+ * Matrix to turn color pixels to a grey scale.
+ *
+ * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert an
+ * image from color to greyscale.
+ */
+ val greyScaleColorMatrix: FloatArray
+ get() = floatArrayOf(
+ 0.299f, 0.299f, 0.299f, 0f,
+ 0.587f, 0.587f, 0.587f, 0f,
+ 0.114f, 0.114f, 0.114f, 0f,
+ 0f, 0f, 0f, 1f
+ )
+
+ /**
+ * Matrix to convert RGB to YUV.
+ *
+ * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
+ * first three bytes of each pixel from RGB to YUV. This leaves the last byte (the alpha
+ * channel) untouched.
+ *
+ * This is a simplistic conversion. Most YUV buffers have a more complicated format, not supported
+ * by this method.
+ */
+ val rgbToYuvMatrix: FloatArray
+ get() = floatArrayOf(
+ 0.299f, -0.14713f, 0.615f, 0f,
+ 0.587f, -0.28886f, -0.51499f, 0f,
+ 0.114f, 0.436f, -0.10001f, 0f,
+ 0f, 0f, 0f, 1f
+ )
+
+ /**
+ * Matrix to convert YUV to RGB.
+ *
+ * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
+ * first three bytes of each pixel from YUV to RGB. This leaves the last byte (the alpha
+ * channel) untouched.
+ *
+ * This is a simplistic conversion. Most YUV buffers have a more complicated format, not supported
+ * by this method. Use {@link RenderScriptToolkit::yuvToRgb} to convert these buffers.
+ */
+ val yuvToRgbMatrix: FloatArray
+ get() = floatArrayOf(
+ 1f, 1f, 1f, 0f,
+ 0f, -0.39465f, 2.03211f, 0f,
+ 1.13983f, -0.5806f, 0f, 0f,
+ 0f, 0f, 0f, 1f
+ )
+
+ /**
+ * Transform an image using a color matrix.
+ *
+ * Converts a 2D array of vectors of unsigned bytes, multiplying each vectors by a 4x4 matrix
+ * and adding an optional vector.
+ *
+ * Each input vector is composed of 1-4 unsigned bytes. If less than 4 bytes, it's extended to
+ * 4, padding with zeroes. The unsigned bytes are converted from 0-255 to 0.0-1.0 floats
+ * before the multiplication is done.
+ *
+ * The resulting value is normalized from 0.0-1.0 to a 0-255 value and stored in the output.
+ * If the output vector size is less than four, the unused channels are discarded.
+ *
+ * If addVector is not specified, a vector of zeroes is added, i.e. a noop.
+ *
+ * Like the RenderScript Intrinsics, a vectorSize of 3 is padded to occupy 4 bytes.
+ *
+ * Check identityMatrix, greyScaleColorMatrix, rgbToYuvMatrix, and yuvToRgbMatrix for sample
+ * matrices. The YUV conversion may not work for all color spaces.
+ *
+ * @param inputArray The buffer of the image to be converted.
+ * @param inputVectorSize The number of bytes in each input cell, a value from 1 to 4.
+ * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
+ * @param outputVectorSize The number of bytes in each output cell, a value from 1 to 4.
+ * @param matrix The 4x4 matrix to multiply, in row major format.
+ * @param addVector A vector of four floats that's added to the result of the multiplication.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The converted buffer.
+ */
+ @JvmOverloads
+ fun colorMatrix(
+ inputArray: ByteArray,
+ inputVectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ outputVectorSize: Int,
+ matrix: FloatArray,
+ addVector: FloatArray = floatArrayOf(0f, 0f, 0f, 0f),
+ restriction: Range2d? = null
+ ): ByteArray {
+ require(inputVectorSize in 1..4) {
+ "$externalName colorMatrix. The inputVectorSize should be between 1 and 4. " +
+ "$inputVectorSize provided."
+ }
+ require(outputVectorSize in 1..4) {
+ "$externalName colorMatrix. The outputVectorSize should be between 1 and 4. " +
+ "$outputVectorSize provided."
+ }
+ require(inputArray.size >= sizeX * sizeY * inputVectorSize) {
+ "$externalName colorMatrix. inputArray is too small for the given dimensions. " +
+ "$sizeX*$sizeY*$inputVectorSize < ${inputArray.size}."
+ }
+ require(matrix.size == 16) {
+ "$externalName colorMatrix. matrix should have 16 entries. ${matrix.size} provided."
+ }
+ require(addVector.size == 4) {
+ "$externalName colorMatrix. addVector should have 4 entries. " +
+ "${addVector.size} provided."
+ }
+ validateRestriction("colorMatrix", sizeX, sizeY, restriction)
+
+ val outputArray = ByteArray(sizeX * sizeY * paddedSize(outputVectorSize))
+ nativeColorMatrix(
+ nativeHandle, inputArray, inputVectorSize, sizeX, sizeY, outputArray, outputVectorSize,
+ matrix, addVector, restriction
+ )
+ return outputArray
+ }
+
+ /**
+ * Transform an image using a color matrix.
+ *
+ * Converts a bitmap, multiplying each RGBA value by a 4x4 matrix and adding an optional vector.
+ * Each byte of the RGBA is converted from 0-255 to 0.0-1.0 floats before the multiplication
+ * is done.
+ *
+ * Bitmaps with a stride different than width * vectorSize are not currently supported.
+ *
+ * The resulting value is normalized from 0.0-1.0 to a 0-255 value and stored in the output.
+ *
+ * If addVector is not specified, a vector of zeroes is added, i.e. a noop.
+ *
+ * Check identityMatrix, greyScaleColorMatrix, rgbToYuvMatrix, and yuvToRgbMatrix for sample
+ * matrices. The YUV conversion may not work for all color spaces.
+ *
+ * @param inputBitmap The image to be converted.
+ * @param matrix The 4x4 matrix to multiply, in row major format.
+ * @param addVector A vector of four floats that's added to the result of the multiplication.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The converted buffer.
+ */
+ @JvmOverloads
+ fun colorMatrix(
+ inputBitmap: Bitmap,
+ matrix: FloatArray,
+ addVector: FloatArray = floatArrayOf(0f, 0f, 0f, 0f),
+ restriction: Range2d? = null
+ ): Bitmap {
+ validateBitmap("colorMatrix", inputBitmap)
+ require(matrix.size == 16) {
+ "$externalName colorMatrix. matrix should have 16 entries. ${matrix.size} provided."
+ }
+ require(addVector.size == 4) {
+ "$externalName colorMatrix. addVector should have 4 entries."
+ }
+ validateRestriction("colorMatrix", inputBitmap.width, inputBitmap.height, restriction)
+
+ val outputBitmap = createCompatibleBitmap(inputBitmap)
+ nativeColorMatrixBitmap(
+ nativeHandle,
+ inputBitmap,
+ outputBitmap,
+ matrix,
+ addVector,
+ restriction
+ )
+ return outputBitmap
+ }
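+
+ // A minimal usage sketch, assuming `bitmap` is an ARGB_8888 Bitmap: convert it to greyscale
+ // using the predefined matrix.
+ //
+ //     val grey = Toolkit.colorMatrix(bitmap, Toolkit.greyScaleColorMatrix)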
+
+ /**
+ * Convolve a ByteArray.
+ *
+ * Applies a 3x3 or 5x5 convolution to the input array using the provided coefficients.
+ * A variant of this method is available to convolve Bitmaps.
+ *
+ * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed.
+ * The coefficients should be provided in row-major format.
+ *
+ * When the square extends past the edge, the edge values will be used as replacement for the
+ * values that are out of bounds.
+ *
+ * Each input cell can be represented by one to four bytes. Each byte is multiplied
+ * and accumulated independently of the other bytes of the cell.
+ *
+ * An optional range parameter can be set to restrict the convolve operation to a rectangular
+ * subset of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY. NOTE: The output buffer will still be full size, with the
+ * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
+ *
+ * The source array should be large enough for sizeX * sizeY * vectorSize bytes. It has a
+ * row-major layout. The output array will have the same dimensions.
+ *
+ * Like the RenderScript Intrinsics, a vectorSize of 3 is padded to occupy 4 bytes.
+ *
+ * @param inputArray The buffer of the image to be blurred.
+ * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+ * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells.
+ * @param coefficients A FloatArray of size 9 or 25, containing the multipliers.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The convolved array.
+ */
+ @JvmOverloads
+ fun convolve(
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ coefficients: FloatArray,
+ restriction: Range2d? = null
+ ): ByteArray {
+ require(vectorSize in 1..4) {
+ "$externalName convolve. The vectorSize should be between 1 and 4. " +
+ "$vectorSize provided."
+ }
+ require(inputArray.size >= sizeX * sizeY * vectorSize) {
+ "$externalName convolve. inputArray is too small for the given dimensions. " +
+ "$sizeX*$sizeY*$vectorSize < ${inputArray.size}."
+ }
+ require(coefficients.size == 9 || coefficients.size == 25) {
+ "$externalName convolve. Only 3x3 or 5x5 convolutions are supported. " +
+ "${coefficients.size} coefficients provided."
+ }
+ validateRestriction("convolve", sizeX, sizeY, restriction)
+
+ val outputArray = ByteArray(inputArray.size)
+ nativeConvolve(
+ nativeHandle,
+ inputArray,
+ vectorSize,
+ sizeX,
+ sizeY,
+ outputArray,
+ coefficients,
+ restriction
+ )
+ return outputArray
+ }
+
+ /**
+ * Convolve a Bitmap.
+ *
+ * Applies a 3x3 or 5x5 convolution to the input Bitmap using the provided coefficients.
+ * A variant of this method is available to convolve ByteArrays. Bitmaps with a stride different
+ * than width * vectorSize are not currently supported.
+ *
+ * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed.
+ * The coefficients should be provided in row-major format.
+ *
+ * Each input cell can be represented by one to four bytes. Each byte is multiplied
+ * and accumulated independently of the other bytes of the cell.
+ *
+ * An optional range parameter can be set to restrict the convolve operation to a rectangular
+ * subset of each bitmap. If provided, the range must be wholly contained within the dimensions
+ * of the bitmap. NOTE: The output Bitmap will still be full size, with the
+ * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
+ *
+ * @param inputBitmap The image to be blurred.
+ * @param coefficients A FloatArray of size 9 or 25, containing the multipliers.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The convolved Bitmap.
+ */
+ @JvmOverloads
+ fun convolve(
+ inputBitmap: Bitmap,
+ coefficients: FloatArray,
+ restriction: Range2d? = null
+ ): Bitmap {
+ validateBitmap("convolve", inputBitmap)
+ require(coefficients.size == 9 || coefficients.size == 25) {
+ "$externalName convolve. Only 3x3 or 5x5 convolutions are supported. " +
+ "${coefficients.size} coefficients provided."
+ }
+ validateRestriction("convolve", inputBitmap, restriction)
+
+ val outputBitmap = createCompatibleBitmap(inputBitmap)
+ nativeConvolveBitmap(nativeHandle, inputBitmap, outputBitmap, coefficients, restriction)
+ return outputBitmap
+ }
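+
+ // A minimal usage sketch, assuming `bitmap` is an ARGB_8888 Bitmap: apply a common 3x3
+ // sharpening kernel, with coefficients given in row-major order.
+ //
+ //     val sharpen = floatArrayOf(
+ //         0f, -1f, 0f,
+ //         -1f, 5f, -1f,
+ //         0f, -1f, 0f)
+ //     val sharpened = Toolkit.convolve(bitmap, sharpen)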
+
+ /**
+ * Compute the histogram of an image.
+ *
+ * Tallies how many times each of the 256 possible values of a byte is found in the input.
+ * A variant of this method is available to do the histogram of a Bitmap.
+ *
+ * An input cell can be represented by one to four bytes. The tally is done independently
+ * for each of the bytes of the cell. Correspondingly, the returned IntArray will have
+ * 256 * vectorSize entries. The counts for value 0 are consecutive, followed by those for
+ * value 1, etc.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained with the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The source buffer should be large enough for sizeX * sizeY * vectorSize bytes. It has a
+ * row-major layout.
+ *
+ * Like the RenderScript Intrinsics, a vectorSize of 3 is padded to occupy 4 bytes.
+ *
+ * @param inputArray The buffer of the image to be analyzed.
+ * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+ * @param sizeX The width of the input buffers, as a number of 1 to 4 byte cells.
+ * @param sizeY The height of the input buffers, as a number of 1 to 4 byte cells.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The resulting array of counts.
+ */
+ @JvmOverloads
+ fun histogram(
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d? = null
+ ): IntArray {
+ require(vectorSize in 1..4) {
+ "$externalName histogram. The vectorSize should be between 1 and 4. " +
+ "$vectorSize provided."
+ }
+ require(inputArray.size >= sizeX * sizeY * vectorSize) {
+ "$externalName histogram. inputArray is too small for the given dimensions. " +
+ "$sizeX*$sizeY*$vectorSize < ${inputArray.size}."
+ }
+ validateRestriction("histogram", sizeX, sizeY, restriction)
+
+ val outputArray = IntArray(256 * paddedSize(vectorSize))
+ nativeHistogram(
+ nativeHandle,
+ inputArray,
+ vectorSize,
+ sizeX,
+ sizeY,
+ outputArray,
+ restriction
+ )
+ return outputArray
+ }
+
+ /**
+ * Compute the histogram of an image.
+ *
+ * Tallies how many times each of the 256 possible values of a byte is found in the bitmap.
+ * This method supports Bitmaps of config ARGB_8888 and ALPHA_8.
+ *
+ * For ARGB_8888, the tally is done independently for each of the four bytes. Correspondingly, the
+ * returned IntArray will have 4 * 256 entries. The counts for value 0 are consecutive,
+ * followed by those for value 1, etc.
+ *
+ * For ALPHA_8, an IntArray of size 256 is returned.
+ *
+ * Bitmaps with a stride different than width * vectorSize are not currently supported.
+ *
+ * A variant of this method is available to do the histogram of a ByteArray.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * of the bitmap.
+ *
+ * @param inputBitmap The bitmap to be analyzed.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The resulting array of counts.
+ */
+ @JvmOverloads
+ fun histogram(
+ inputBitmap: Bitmap,
+ restriction: Range2d? = null
+ ): IntArray {
+ validateBitmap("histogram", inputBitmap)
+ validateRestriction("histogram", inputBitmap, restriction)
+
+ val outputArray = IntArray(256 * vectorSize(inputBitmap))
+ nativeHistogramBitmap(nativeHandle, inputBitmap, outputArray, restriction)
+ return outputArray
+ }
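+
+ // A minimal usage sketch, assuming `bitmap` is an ARGB_8888 Bitmap. The returned counts are
+ // grouped by value: the four per-channel tallies for byte value 0 come first, then those
+ // for value 1, and so on.
+ //
+ //     val counts = Toolkit.histogram(bitmap)
+ //     val firstChannelZeros = counts[0]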
+
+ /**
+ * Compute the histogram of the dot product of an image.
+ *
+ * This method supports cells of 1 to 4 bytes in length. For each cell of the array,
+ * the dot product of its bytes with the provided coefficients is computed. The resulting
+ * floating point value is converted to an unsigned byte and tallied in the histogram.
+ *
+ * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used,
+ * i.e. the values [0.299f, 0.587f, 0.114f, 0.f].
+ *
+ * Each coefficient must be >= 0 and their sum must be 1.0 or less. There must be the same
+ * number of coefficients as vectorSize.
+ *
+ * A variant of this method is available to do the histogram of a Bitmap.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY.
+ *
+ * The source buffer should be large enough for sizeX * sizeY * vectorSize bytes. The returned
+ * array will have 256 ints.
+ *
+ * Like the RenderScript Intrinsics, a vectorSize of 3 is padded to occupy 4 bytes.
+ *
+ * @param inputArray The buffer of the image to be analyzed.
+ * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+ * @param sizeX The width of the input buffers, as a number of 1 to 4 byte cells.
+ * @param sizeY The height of the input buffers, as a number of 1 to 4 byte cells.
+ * @param coefficients The dot product multipliers. Size should equal vectorSize. Can be null.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The resulting vector of counts.
+ */
+ @JvmOverloads
+ fun histogramDot(
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ coefficients: FloatArray? = null,
+ restriction: Range2d? = null
+ ): IntArray {
+ require(vectorSize in 1..4) {
+ "$externalName histogramDot. The vectorSize should be between 1 and 4. " +
+ "$vectorSize provided."
+ }
+ require(inputArray.size >= sizeX * sizeY * vectorSize) {
+ "$externalName histogramDot. inputArray is too small for the given dimensions. " +
+ "$sizeX*$sizeY*$vectorSize < ${inputArray.size}."
+ }
+ validateHistogramDotCoefficients(coefficients, vectorSize)
+ validateRestriction("histogramDot", sizeX, sizeY, restriction)
+
+ val outputArray = IntArray(256)
+ val actualCoefficients = coefficients ?: floatArrayOf(0.299f, 0.587f, 0.114f, 0f)
+ nativeHistogramDot(
+ nativeHandle,
+ inputArray,
+ vectorSize,
+ sizeX,
+ sizeY,
+ outputArray,
+ actualCoefficients,
+ restriction
+ )
+ return outputArray
+ }
+
+ /**
+ * Compute the histogram of the dot product of an image.
+ *
+ * This method supports Bitmaps of config ARGB_8888 and ALPHA_8. For each pixel of the bitmap,
+ * the dot product of its bytes with the provided coefficients is computed. The resulting
+ * floating point value is converted to an unsigned byte and tallied in the histogram.
+ *
+ * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used,
+ * i.e. the values [0.299f, 0.587f, 0.114f, 0.f].
+ *
+ * Each coefficient must be >= 0 and their sum must be 1.0 or less. For ARGB_8888, four values
+ * must be provided; for ALPHA_8, one.
+ *
+ * Bitmaps with a stride different than width * vectorSize are not currently supported.
+ *
+ * A variant of this method is available to do the histogram of a ByteArray.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * of the bitmap.
+ *
+ * The returned array will have 256 ints.
+ *
+ * @param inputBitmap The bitmap to be analyzed.
+ * @param coefficients The one or four values used for the dot product. Can be null.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The resulting vector of counts.
+ */
+ @JvmOverloads
+ fun histogramDot(
+ inputBitmap: Bitmap,
+ coefficients: FloatArray? = null,
+ restriction: Range2d? = null
+ ): IntArray {
+ validateBitmap("histogramDot", inputBitmap)
+ validateHistogramDotCoefficients(coefficients, vectorSize(inputBitmap))
+ validateRestriction("histogramDot", inputBitmap, restriction)
+
+ val outputArray = IntArray(256)
+ val actualCoefficients = coefficients ?: floatArrayOf(0.299f, 0.587f, 0.114f, 0f)
+ nativeHistogramDotBitmap(
+ nativeHandle, inputBitmap, outputArray, actualCoefficients, restriction
+ )
+ return outputArray
+ }
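+
+ // A minimal usage sketch, assuming `bitmap` is an ARGB_8888 Bitmap. Leaving coefficients
+ // null uses the default luminosity weights [0.299, 0.587, 0.114, 0].
+ //
+ //     val luminosity = Toolkit.histogramDot(bitmap)  // IntArray of 256 counts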
+
+ /**
+ * Transform an image using a lookup table.
+ *
+ * Transforms an image by using a per-channel lookup table. Each channel of the input has an
+ * independent lookup table. The tables are 256 entries in size and can cover the full value
+ * range of a byte.
+ *
+ * The input array should be in RGBA format, where four consecutive bytes form a cell.
+ * A variant of this method is available to transform a Bitmap.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY. NOTE: The output array will still be full size, with the
+ * section that's not transformed all set to 0. This is to stay compatible with RenderScript.
+ *
+ * The source array should be large enough for sizeX * sizeY * 4 bytes. The returned
+ * array has the same dimensions as the input. The arrays have a row-major layout.
+ *
+ * @param inputArray The buffer of the image to be transformed.
+ * @param sizeX The width of both buffers, as a number of 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 4 byte cells.
+ * @param table The four arrays of 256 values that are used to convert each channel.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The transformed image.
+ */
+ @JvmOverloads
+ fun lut(
+ inputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ table: LookupTable,
+ restriction: Range2d? = null
+ ): ByteArray {
+ require(inputArray.size >= sizeX * sizeY * 4) {
+ "$externalName lut. inputArray is too small for the given dimensions. " +
+ "$sizeX*$sizeY*4 < ${inputArray.size}."
+ }
+ validateRestriction("lut", sizeX, sizeY, restriction)
+
+ val outputArray = ByteArray(inputArray.size)
+ nativeLut(
+ nativeHandle,
+ inputArray,
+ outputArray,
+ sizeX,
+ sizeY,
+ table.red,
+ table.green,
+ table.blue,
+ table.alpha,
+ restriction
+ )
+ return outputArray
+ }
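+
+ // Illustrative sketch, assuming a Toolkit instance `toolkit` and a packed RGBA buffer
+ // `rgbaBytes` of an invented 100x50 size: build a table that inverts only the red channel
+ // and apply it with lut. The other channels keep their identity mapping.
+ //
+ //   val invertRed = LookupTable().apply { red = ByteArray(256) { (255 - it).toByte() } }
+ //   val transformed = toolkit.lut(rgbaBytes, 100, 50, invertRed)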
+
+ /**
+ * Transform an image using a look up table
+ *
+ * Transforms an image by using a per-channel lookup table. Each channel of the input has an
+ * independent lookup table. The tables are 256 entries in size and can cover the full value
+ * range of a byte.
+ *
+ * The input Bitmap should be in config ARGB_8888. A variant of this method is available to
+ * transform a ByteArray. Bitmaps with a stride different than width * vectorSize are not
+ * currently supported.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each Bitmap. If provided, the range must be wholly contained within the dimensions
+ * of the Bitmap. NOTE: The output Bitmap will still be full size, with the
+ * section that's not transformed all set to 0. This is to stay compatible with RenderScript.
+ *
+ * @param inputBitmap The Bitmap to be transformed.
+ * @param table The four arrays of 256 values that are used to convert each channel.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The transformed image.
+ */
+ @JvmOverloads
+ fun lut(
+ inputBitmap: Bitmap,
+ table: LookupTable,
+ restriction: Range2d? = null
+ ): Bitmap {
+ validateBitmap("lut", inputBitmap)
+ validateRestriction("lut", inputBitmap, restriction)
+
+ val outputBitmap = createCompatibleBitmap(inputBitmap)
+ nativeLutBitmap(
+ nativeHandle,
+ inputBitmap,
+ outputBitmap,
+ table.red,
+ table.green,
+ table.blue,
+ table.alpha,
+ restriction
+ )
+ return outputBitmap
+ }
+
+ /**
+ * Transform an image using a 3D look up table
+ *
+ * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G,
+ * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest
+ * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry
+ * is returned in the output array.
+ *
+ * The input array should be in RGBA format, where four consecutive bytes form a cell.
+ * The fourth byte of each input cell is ignored. A variant of this method is also available
+ * to transform Bitmaps.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each buffer. If provided, the range must be wholly contained within the dimensions
+ * described by sizeX and sizeY. NOTE: The output array will still be full size, with the
+ * section that's not transformed all set to 0. This is to stay compatible with RenderScript.
+ *
+ * The source array should be large enough for sizeX * sizeY * 4 bytes. The returned
+ * array will have the same dimensions. The arrays have a row-major layout.
+ *
+ * @param inputArray The buffer of the image to be transformed.
+ * @param sizeX The width of both buffers, as a number of 4 byte cells.
+ * @param sizeY The height of both buffers, as a number of 4 byte cells.
+ * @param cube The translation cube.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The transformed image.
+ */
+ @JvmOverloads
+ fun lut3d(
+ inputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ cube: Rgba3dArray,
+ restriction: Range2d? = null
+ ): ByteArray {
+ require(inputArray.size >= sizeX * sizeY * 4) {
+ "$externalName lut3d. inputArray is too small for the given dimensions. " +
+ "$sizeX*$sizeY*4 < ${inputArray.size}."
+ }
+ require(
+ cube.sizeX >= 2 && cube.sizeY >= 2 && cube.sizeZ >= 2 &&
+ cube.sizeX <= 256 && cube.sizeY <= 256 && cube.sizeZ <= 256
+ ) {
+ "$externalName lut3d. The dimensions of the cube should be between 2 and 256. " +
+ "(${cube.sizeX}, ${cube.sizeY}, ${cube.sizeZ}) provided."
+ }
+ validateRestriction("lut3d", sizeX, sizeY, restriction)
+
+ val outputArray = ByteArray(inputArray.size)
+ nativeLut3d(
+ nativeHandle, inputArray, outputArray, sizeX, sizeY, cube.values, cube.sizeX,
+ cube.sizeY, cube.sizeZ, restriction
+ )
+ return outputArray
+ }
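+
+ // Illustrative sketch, assuming a Toolkit instance `toolkit` and a packed RGBA buffer
+ // `rgbaBytes` of an invented 64x64 size: a 2x2x2 cube whose corners hold their own colors
+ // acts as an (approximate) identity mapping, since the eight corners are interpolated.
+ //
+ //   val cube = Rgba3dArray(ByteArray(2 * 2 * 2 * 4), 2, 2, 2)
+ //   for (z in 0..1) for (y in 0..1) for (x in 0..1) {
+ //       cube[x, y, z] =
+ //           byteArrayOf((x * 255).toByte(), (y * 255).toByte(), (z * 255).toByte(), -1)
+ //   }
+ //   val mapped = toolkit.lut3d(rgbaBytes, 64, 64, cube)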
+
+ /**
+ * Transform an image using a 3D look up table
+ *
+ * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G,
+ * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest
+ * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry
+ * is returned in the output array.
+ *
+ * The input Bitmap should be in config ARGB_8888. The alpha channel is preserved. A variant
+ * of this method is also available to transform a ByteArray. Bitmaps with a stride different
+ * from width * vectorSize are not currently supported.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of each Bitmap. If provided, the range must be wholly contained within the dimensions
+ * of the Bitmap. NOTE: The output Bitmap will still be full size, with the
+ * section that's not transformed all set to 0. This is to stay compatible with RenderScript.
+ *
+ * The returned Bitmap has the same dimensions as the input.
+ *
+ * @param inputBitmap The image to be transformed.
+ * @param cube The translation cube.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return The transformed image.
+ */
+ @JvmOverloads
+ fun lut3d(
+ inputBitmap: Bitmap,
+ cube: Rgba3dArray,
+ restriction: Range2d? = null
+ ): Bitmap {
+ validateBitmap("lut3d", inputBitmap)
+ validateRestriction("lut3d", inputBitmap, restriction)
+
+ val outputBitmap = createCompatibleBitmap(inputBitmap)
+ nativeLut3dBitmap(
+ nativeHandle, inputBitmap, outputBitmap, cube.values, cube.sizeX,
+ cube.sizeY, cube.sizeZ, restriction
+ )
+ return outputBitmap
+ }
+
+ /**
+ * Resize an image.
+ *
+ * Resizes an image using bicubic interpolation.
+ *
+ * This method supports elements of 1 to 4 bytes in length. Each byte of the element is
+ * interpolated independently from the others.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of the output buffer. The corresponding scaled range of the input will be used. If provided,
+ * the range must be wholly contained within the dimensions described by outputSizeX and
+ * outputSizeY.
+ *
+ * The input and output arrays have a row-major layout. The input array should be
+ * large enough for sizeX * sizeY * vectorSize bytes.
+ *
+ * Like the RenderScript Intrinsics, a vectorSize of 3 is padded to occupy 4 bytes.
+ *
+ * @param inputArray The buffer of the image to be resized.
+ * @param vectorSize The number of bytes in each element of both buffers. A value from 1 to 4.
+ * @param inputSizeX The width of the input buffer, as a number of 1-4 byte elements.
+ * @param inputSizeY The height of the input buffer, as a number of 1-4 byte elements.
+ * @param outputSizeX The width of the output buffer, as a number of 1-4 byte elements.
+ * @param outputSizeY The height of the output buffer, as a number of 1-4 byte elements.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return An array that contains the rescaled image.
+ */
+ @JvmOverloads
+ fun resize(
+ inputArray: ByteArray,
+ vectorSize: Int,
+ inputSizeX: Int,
+ inputSizeY: Int,
+ outputSizeX: Int,
+ outputSizeY: Int,
+ restriction: Range2d? = null
+ ): ByteArray {
+ require(vectorSize in 1..4) {
+ "$externalName resize. The vectorSize should be between 1 and 4. $vectorSize provided."
+ }
+ require(inputArray.size >= inputSizeX * inputSizeY * vectorSize) {
+ "$externalName resize. inputArray is too small for the given dimensions. " +
+ "$inputSizeX*$inputSizeY*$vectorSize < ${inputArray.size}."
+ }
+ validateRestriction("resize", outputSizeX, outputSizeY, restriction)
+
+ val outputArray = ByteArray(outputSizeX * outputSizeY * paddedSize(vectorSize))
+ nativeResize(
+ nativeHandle,
+ inputArray,
+ vectorSize,
+ inputSizeX,
+ inputSizeY,
+ outputArray,
+ outputSizeX,
+ outputSizeY,
+ restriction
+ )
+ return outputArray
+ }
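+
+ // Illustrative sketch, assuming a Toolkit instance `toolkit` and a single-channel buffer
+ // `greyBytes` of an invented 640x480 size: halve the image in both dimensions.
+ //
+ //   val halved = toolkit.resize(greyBytes, 1, 640, 480, 320, 240)
+ //   check(halved.size == 320 * 240)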
+
+ /**
+ * Resize an image.
+ *
+ * Resizes an image using bicubic interpolation.
+ *
+ * This method supports input Bitmap of config ARGB_8888 and ALPHA_8. The returned Bitmap
+ * has the same config. Bitmaps with a stride different than width * vectorSize are not
+ * currently supported.
+ *
+ * An optional range parameter can be set to restrict the operation to a rectangular subset
+ * of the output buffer. The corresponding scaled range of the input will be used. If provided,
+ * the range must be wholly contained within the dimensions described by outputSizeX and
+ * outputSizeY.
+ *
+ * @param inputBitmap The Bitmap to be resized.
+ * @param outputSizeX The width of the output Bitmap, in pixels.
+ * @param outputSizeY The height of the output Bitmap, in pixels.
+ * @param restriction When not null, restricts the operation to a 2D range of pixels.
+ * @return A Bitmap that contains the rescaled image.
+ */
+ @JvmOverloads
+ fun resize(
+ inputBitmap: Bitmap,
+ outputSizeX: Int,
+ outputSizeY: Int,
+ restriction: Range2d? = null
+ ): Bitmap {
+ validateBitmap("resize", inputBitmap)
+ validateRestriction("resize", outputSizeX, outputSizeY, restriction)
+
+ val outputBitmap = Bitmap.createBitmap(outputSizeX, outputSizeY, Bitmap.Config.ARGB_8888)
+ nativeResizeBitmap(nativeHandle, inputBitmap, outputBitmap, restriction)
+ return outputBitmap
+ }
+
+ /**
+ * Convert an image from YUV to RGB.
+ *
+ * Converts a YUV buffer to RGB. The input array should be supplied in a supported YUV format.
+ * The output is RGBA; the alpha channel will be set to 255.
+ *
+ * Note that for YV12 and a sizeX that's not a multiple of 32, the RenderScript Intrinsic may
+ * not have converted the image correctly. This Toolkit method should.
+ *
+ * @param inputArray The buffer of the image to be converted.
+ * @param sizeX The width in pixels of the image.
+ * @param sizeY The height in pixels of the image.
+ * @param format Either YV12 or NV21.
+ * @return The converted image as a byte array.
+ */
+ fun yuvToRgb(inputArray: ByteArray, sizeX: Int, sizeY: Int, format: YuvFormat): ByteArray {
+ require(sizeX % 2 == 0 && sizeY % 2 == 0) {
+ "$externalName yuvToRgb. Non-even dimensions are not supported. " +
+ "$sizeX and $sizeY were provided."
+ }
+
+ val outputArray = ByteArray(sizeX * sizeY * 4)
+ nativeYuvToRgb(nativeHandle, inputArray, outputArray, sizeX, sizeY, format.value)
+ return outputArray
+ }
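+
+ // Illustrative sketch, assuming a Toolkit instance `toolkit` and an NV21 buffer `nv21Bytes`
+ // for an invented 640x480 frame (such a frame typically occupies 640 * 480 * 3 / 2 bytes):
+ //
+ //   val rgba = toolkit.yuvToRgb(nv21Bytes, 640, 480, YuvFormat.NV21)
+ //   check(rgba.size == 640 * 480 * 4)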
+
+ /**
+ * Convert an image from YUV to an RGB Bitmap.
+ *
+ * Converts a YUV buffer to an RGB Bitmap. The input array should be supplied in a supported
+ * YUV format. The output is RGBA; the alpha channel will be set to 255.
+ *
+ * Note that for YV12 and a sizeX that's not a multiple of 32, the RenderScript Intrinsic may
+ * not have converted the image correctly. This Toolkit method should.
+ *
+ * @param inputArray The buffer of the image to be converted.
+ * @param sizeX The width in pixels of the image.
+ * @param sizeY The height in pixels of the image.
+ * @param format Either YV12 or NV21.
+ * @return The converted image.
+ */
+ fun yuvToRgbBitmap(inputArray: ByteArray, sizeX: Int, sizeY: Int, format: YuvFormat): Bitmap {
+ require(sizeX % 2 == 0 && sizeY % 2 == 0) {
+ "$externalName yuvToRgbBitmap. Non-even dimensions are not supported. " +
+ "$sizeX and $sizeY were provided."
+ }
+
+ val outputBitmap = Bitmap.createBitmap(sizeX, sizeY, Bitmap.Config.ARGB_8888)
+ nativeYuvToRgbBitmap(nativeHandle, inputArray, sizeX, sizeY, outputBitmap, format.value)
+ return outputBitmap
+ }
+
+ init {
+ System.loadLibrary("renderscript-toolkit")
+ nativeHandle = createNative()
+ }
+
+ /**
+ * Shut down the thread pool.
+ *
+ * Waits for the threads to complete their work and destroys them.
+ *
+ * An application should call this method only if it is sure that it won't call the
+ * toolkit again, as it is irreversible.
+ */
+ fun shutdown() {
+ destroyNative(nativeHandle)
+ nativeHandle = 0
+ }
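+
+ // Illustrative lifecycle sketch, assuming `inputBitmap` is an ARGB_8888 Bitmap with no row
+ // padding: create a Toolkit, run as many operations as needed, then shut it down once it is
+ // definitely no longer needed.
+ //
+ //   val toolkit = Toolkit()
+ //   val histogram = toolkit.histogramDot(inputBitmap)
+ //   toolkit.shutdown()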
+
+ private var nativeHandle: Long = 0
+
+ private external fun createNative(): Long
+
+ private external fun destroyNative(nativeHandle: Long)
+
+ private external fun nativeBlend(
+ nativeHandle: Long,
+ mode: Int,
+ sourceArray: ByteArray,
+ destArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d?
+ )
+
+ private external fun nativeBlendBitmap(
+ nativeHandle: Long,
+ mode: Int,
+ sourceBitmap: Bitmap,
+ destBitmap: Bitmap,
+ restriction: Range2d?
+ )
+
+ private external fun nativeBlur(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ radius: Int,
+ outputArray: ByteArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeBlurBitmap(
+ nativeHandle: Long,
+ inputBitmap: Bitmap,
+ outputBitmap: Bitmap,
+ radius: Int,
+ restriction: Range2d?
+ )
+
+ private external fun nativeColorMatrix(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ inputVectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ outputArray: ByteArray,
+ outputVectorSize: Int,
+ matrix: FloatArray,
+ addVector: FloatArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeColorMatrixBitmap(
+ nativeHandle: Long,
+ inputBitmap: Bitmap,
+ outputBitmap: Bitmap,
+ matrix: FloatArray,
+ addVector: FloatArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeConvolve(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ outputArray: ByteArray,
+ coefficients: FloatArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeConvolveBitmap(
+ nativeHandle: Long,
+ inputBitmap: Bitmap,
+ outputBitmap: Bitmap,
+ coefficients: FloatArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeHistogram(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ outputArray: IntArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeHistogramBitmap(
+ nativeHandle: Long,
+ inputBitmap: Bitmap,
+ outputArray: IntArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeHistogramDot(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ outputArray: IntArray,
+ coefficients: FloatArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeHistogramDotBitmap(
+ nativeHandle: Long,
+ inputBitmap: Bitmap,
+ outputArray: IntArray,
+ coefficients: FloatArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeLut(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ outputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ red: ByteArray,
+ green: ByteArray,
+ blue: ByteArray,
+ alpha: ByteArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeLutBitmap(
+ nativeHandle: Long,
+ inputBitmap: Bitmap,
+ outputBitmap: Bitmap,
+ red: ByteArray,
+ green: ByteArray,
+ blue: ByteArray,
+ alpha: ByteArray,
+ restriction: Range2d?
+ )
+
+ private external fun nativeLut3d(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ outputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ cube: ByteArray,
+ cubeSizeX: Int,
+ cubeSizeY: Int,
+ cubeSizeZ: Int,
+ restriction: Range2d?
+ )
+
+ private external fun nativeLut3dBitmap(
+ nativeHandle: Long,
+ inputBitmap: Bitmap,
+ outputBitmap: Bitmap,
+ cube: ByteArray,
+ cubeSizeX: Int,
+ cubeSizeY: Int,
+ cubeSizeZ: Int,
+ restriction: Range2d?
+ )
+
+ private external fun nativeResize(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ inputSizeX: Int,
+ inputSizeY: Int,
+ outputArray: ByteArray,
+ outputSizeX: Int,
+ outputSizeY: Int,
+ restriction: Range2d?
+ )
+
+ private external fun nativeResizeBitmap(
+ nativeHandle: Long,
+ inputBitmap: Bitmap,
+ outputBitmap: Bitmap,
+ restriction: Range2d?
+ )
+
+ private external fun nativeYuvToRgb(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ outputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ format: Int
+ )
+
+ private external fun nativeYuvToRgbBitmap(
+ nativeHandle: Long,
+ inputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ outputBitmap: Bitmap,
+ format: Int
+ )
+}
+
+/**
+ * Determines how a source buffer is blended into a destination buffer.
+ * See [Toolkit.blend].
+ *
+ * blend only works on 4 byte RGBA data. In the descriptions below, ".a" represents
+ * the alpha channel.
+ */
+enum class BlendingMode(val value: Int) {
+ /**
+ * dest = 0
+ *
+ * The destination is cleared, i.e. each pixel is set to (0, 0, 0, 0)
+ */
+ CLEAR(0),
+
+ /**
+ * dest = src
+ *
+ * Sets each pixel of the destination to the corresponding one in the source.
+ */
+ SRC(1),
+
+ /**
+ * dest = dest
+ *
+ * Leaves the destination untouched. This is a no-op.
+ */
+ DST(2),
+
+ /**
+ * dest = src + dest * (1.0 - src.a)
+ */
+ SRC_OVER(3),
+
+ /**
+ * dest = dest + src * (1.0 - dest.a)
+ */
+ DST_OVER(4),
+
+ /**
+ * dest = src * dest.a
+ */
+ SRC_IN(5),
+
+ /**
+ * dest = dest * src.a
+ */
+ DST_IN(6),
+
+ /**
+ * dest = src * (1.0 - dest.a)
+ */
+ SRC_OUT(7),
+
+ /**
+ * dest = dest * (1.0 - src.a)
+ */
+ DST_OUT(8),
+
+ /**
+ * dest.rgb = src.rgb * dest.a + (1.0 - src.a) * dest.rgb, dest.a = dest.a
+ */
+ SRC_ATOP(9),
+
+ /**
+ * dest.rgb = dest.rgb * src.a + (1.0 - dest.a) * src.rgb, dest.a = src.a
+ */
+ DST_ATOP(10),
+
+ /**
+ * dest = {src.r ^ dest.r, src.g ^ dest.g, src.b ^ dest.b, src.a ^ dest.a}
+ *
+ * Note: this is NOT the Porter/Duff XOR mode; this is a bitwise xor.
+ */
+ XOR(11),
+
+ /**
+ * dest = src * dest
+ */
+ MULTIPLY(12),
+
+ /**
+ * dest = min(src + dest, 1.0)
+ */
+ ADD(13),
+
+ /**
+ * dest = max(dest - src, 0.0)
+ */
+ SUBTRACT(14)
+}
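+
+// Illustrative sketch, assuming a Toolkit instance `toolkit` and two same-sized RGBA byte
+// arrays `src` and `dst` describing an invented 32x32 image. Blend modifies the destination
+// buffer in place; here the source is composited over the destination:
+//
+//   toolkit.blend(BlendingMode.SRC_OVER, src, dst, 32, 32, null)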
+
+/**
+ * A translation table used by the lut method. For each potential red, green, blue, and alpha
+ * value, it specifies its replacement value.
+ *
+ * The fields are initialized to the identity mapping, i.e. 1 maps to 1, 2 maps to 2, etc.
+ * You only need to modify the values you want to translate.
+ */
+class LookupTable {
+ var red = ByteArray(256) { it.toByte() }
+ var green = ByteArray(256) { it.toByte() }
+ var blue = ByteArray(256) { it.toByte() }
+ var alpha = ByteArray(256) { it.toByte() }
+}
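+
+// A minimal LookupTable sketch: replace only the alpha table so every pixel becomes fully
+// opaque, leaving the red, green, and blue identity mappings untouched.
+//
+//   val forceOpaque = LookupTable().apply { alpha = ByteArray(256) { 255.toByte() } }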
+
+/**
+ * The YUV formats supported by yuvToRgb.
+ */
+enum class YuvFormat(val value: Int) {
+ NV21(0x11),
+ YV12(0x32315659),
+}
+
+/**
+ * Define a range of data to process.
+ *
+ * This class is used to restrict a [Toolkit] operation to a rectangular subset of the input
+ * buffer.
+ *
+ * @property startX The index of the first value to be included on the X axis.
+ * @property endX The index after the last value to be included on the X axis.
+ * @property startY The index of the first value to be included on the Y axis.
+ * @property endY The index after the last value to be included on the Y axis.
+ */
+data class Range2d(
+ val startX: Int,
+ val endX: Int,
+ val startY: Int,
+ val endY: Int
+) {
+ constructor() : this(0, 0, 0, 0)
+}
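+
+// A minimal Range2d sketch, assuming a 100x80 buffer: restrict an operation to the 20x10
+// rectangle whose top-left corner is at (40, 30). Note that endX and endY are exclusive.
+//
+//   val centerPatch = Range2d(40, 60, 30, 40)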
+
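+/**
+ * A simple 3D array of RGBA values, used as the translation cube by lut3d.
+ *
+ * Each (x, y, z) cell holds four consecutive bytes (R, G, B, A) in [values]; x varies fastest,
+ * then y, then z. The array must hold at least sizeX * sizeY * sizeZ * 4 bytes.
+ */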
+class Rgba3dArray(val values: ByteArray, val sizeX: Int, val sizeY: Int, val sizeZ: Int) {
+ init {
+ require(values.size >= sizeX * sizeY * sizeZ * 4)
+ }
+
+ operator fun get(x: Int, y: Int, z: Int): ByteArray {
+ val index = indexOfVector(x, y, z)
+ return ByteArray(4) { values[index + it] }
+ }
+
+ operator fun set(x: Int, y: Int, z: Int, value: ByteArray) {
+ require(value.size == 4)
+ val index = indexOfVector(x, y, z)
+ for (i in 0..3) {
+ values[index + i] = value[i]
+ }
+ }
+
+ private fun indexOfVector(x: Int, y: Int, z: Int): Int {
+ require(x in 0 until sizeX)
+ require(y in 0 until sizeY)
+ require(z in 0 until sizeZ)
+ return ((z * sizeY + y) * sizeX + x) * 4
+ }
+}
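+
+// A minimal Rgba3dArray sketch, assuming `cube` is a 2x2x2 Rgba3dArray like the one sketched
+// for lut3d above: get returns the four bytes (R, G, B, A) stored at a coordinate, set
+// replaces them.
+//
+//   val corner: ByteArray = cube[1, 0, 1]
+//   check(corner.size == 4)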
+
+internal fun validateBitmap(
+ function: String,
+ inputBitmap: Bitmap,
+ alphaAllowed: Boolean = true
+) {
+ if (alphaAllowed) {
+ require(
+ inputBitmap.config == Bitmap.Config.ARGB_8888 ||
+ inputBitmap.config == Bitmap.Config.ALPHA_8
+ ) {
+ "$externalName. $function supports only ARGB_8888 and ALPHA_8 bitmaps. " +
+ "${inputBitmap.config} provided."
+ }
+ } else {
+ require(inputBitmap.config == Bitmap.Config.ARGB_8888) {
+ "$externalName. $function supports only ARGB_8888. " +
+ "${inputBitmap.config} provided."
+ }
+ }
+ require(inputBitmap.width * vectorSize(inputBitmap) == inputBitmap.rowBytes) {
+ "$externalName $function. Only bitmaps with rowSize equal to the width * vectorSize are " +
+ "currently supported. Provided were rowBytes=${inputBitmap.rowBytes}, " +
+ "width={${inputBitmap.width}, and vectorSize=${vectorSize(inputBitmap)}."
+ }
+}
+
+internal fun createCompatibleBitmap(inputBitmap: Bitmap) =
+ Bitmap.createBitmap(inputBitmap.width, inputBitmap.height, inputBitmap.config)
+
+internal fun validateHistogramDotCoefficients(
+ coefficients: FloatArray?,
+ vectorSize: Int
+) {
+ require(coefficients == null || coefficients.size == vectorSize) {
+ "$externalName histogramDot. The coefficients should be null or have $vectorSize values."
+ }
+ if (coefficients !== null) {
+ var sum = 0f
+ for (i in 0 until vectorSize) {
+ require(coefficients[i] >= 0.0f) {
+ "$externalName histogramDot. Coefficients should not be negative. " +
+ "Coefficient $i was ${coefficients[i]}."
+ }
+ sum += coefficients[i]
+ }
+ require(sum <= 1.0f) {
+ "$externalName histogramDot. Coefficients should add to 1 or less. Their sum is $sum."
+ }
+ }
+}
+
+internal fun validateRestriction(tag: String, bitmap: Bitmap, restriction: Range2d? = null) {
+ validateRestriction(tag, bitmap.width, bitmap.height, restriction)
+}
+
+internal fun validateRestriction(
+ tag: String,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d? = null
+) {
+ if (restriction == null) return
+ require(restriction.startX < sizeX && restriction.endX <= sizeX) {
+ "$externalName $tag. sizeX should be greater than restriction.startX and greater " +
+ "or equal to restriction.endX. $sizeX, ${restriction.startX}, " +
+ "and ${restriction.endX} were provided respectively."
+ }
+ require(restriction.startY < sizeY && restriction.endY <= sizeY) {
+ "$externalName $tag. sizeY should be greater than restriction.startY and greater " +
+ "or equal to restriction.endY. $sizeY, ${restriction.startY}, " +
+ "and ${restriction.endY} were provided respectively."
+ }
+ require(restriction.startX < restriction.endX) {
+ "$externalName $tag. Restriction startX should be less than endX. " +
+ "${restriction.startX} and ${restriction.endX} were provided respectively."
+ }
+ require(restriction.startY < restriction.endY) {
+ "$externalName $tag. Restriction startY should be less than endY. " +
+ "${restriction.startY} and ${restriction.endY} were provided respectively."
+ }
+}
+
+internal fun vectorSize(bitmap: Bitmap): Int {
+ return when (bitmap.config) {
+ Bitmap.Config.ARGB_8888 -> 4
+ Bitmap.Config.ALPHA_8 -> 1
+ else -> throw IllegalArgumentException(
+ "$externalName. Only ARGB_8888 and ALPHA_8 Bitmap are supported."
+ )
+ }
+}
+
+internal fun paddedSize(vectorSize: Int) = if (vectorSize == 3) 4 else vectorSize
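+
+// A minimal paddedSize sketch: only a vectorSize of 3 is widened, matching the 4 byte padding
+// of 3-channel cells described in the method documentation above.
+//
+//   check(paddedSize(3) == 4)
+//   check(paddedSize(2) == 2)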
diff --git a/toolkit/test/AllTests.kt b/toolkit/test/AllTests.kt
new file mode 100644
index 0000000..5833795
--- /dev/null
+++ b/toolkit/test/AllTests.kt
@@ -0,0 +1,1244 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO Rename to something better
+package com.example.testapp
+
+import android.content.Context
+import android.graphics.Bitmap
+import android.graphics.BitmapFactory
+import android.renderscript.RenderScript
+import android.renderscript.toolkit.BlendingMode
+import android.renderscript.toolkit.LookupTable
+import android.renderscript.toolkit.Range2d
+import android.renderscript.toolkit.Rgba3dArray
+import android.renderscript.toolkit.Toolkit
+import android.renderscript.toolkit.YuvFormat
+import kotlin.math.abs
+import kotlin.math.min
+
+data class TestLayout(
+ val sizeX: Int,
+ val sizeY: Int,
+ val restriction: Range2d?
+)
+
+// List of dimensions (sizeX, sizeY) to try when generating random data.
+val commonLayoutsToTry = listOf(
+ // Small layouts to start with
+ TestLayout(3, 4, null),
+ TestLayout(3, 4, Range2d(0, 1, 0, 3)),
+ TestLayout(3, 4, Range2d(2, 3, 1, 4)),
+ TestLayout(10, 14, null),
+ TestLayout(10, 14, Range2d(2, 3, 8, 14)),
+ // The size of most CTS intrinsic tests
+ TestLayout(160, 100, null),
+ TestLayout(125, 227, Range2d(50, 125, 100, 227)),
+ // A larger one
+ TestLayout(800, 600, null),
+ // Weirdly shaped ones
+ TestLayout(1, 1, null), // A single item
+ // TODO This size makes Intrinsic Blur fail.
+ TestLayout(16000, 1, null), // One very long row
+ TestLayout(1, 16000, null), // One very long column
+ // A very large test
+ TestLayout(1024, 2048, null),
+)
+
+
+class Tester(context: Context, private val validate: Boolean) {
+ private val renderscriptContext = RenderScript.create(context)
+ private val toolkit = Toolkit()
+ private val testImage1 = BitmapFactory.decodeResource(context.resources, R.drawable.img800x450a)
+ private val testImage2 = BitmapFactory.decodeResource(context.resources, R.drawable.img800x450b)
+
+ init {
+ validateTestImage(testImage1)
+ validateTestImage(testImage2)
+ }
+
+ /**
+ * Verify that the test images are in a format that works for our tests.
+ */
+ private fun validateTestImage(bitmap: Bitmap) {
+ require(bitmap.config == Bitmap.Config.ARGB_8888)
+ require(bitmap.rowBytes == bitmap.width * 4) {
+ "Can't handle bitmaps that have extra padding. " +
+ "${bitmap.rowBytes} != ${bitmap.width} * 4." }
+ require(bitmap.byteCount == bitmap.rowBytes * bitmap.height)
+ }
+
+ fun destroy() {
+ renderscriptContext.destroy()
+ }
+
+ @ExperimentalUnsignedTypes
+ fun testAll(timer: TimingTracker): String {
+ val tests = listOf(
+ Pair("blend", ::testBlend),
+ Pair("blur", ::testBlur),
+ Pair("colorMatrix", ::testColorMatrix),
+ Pair("convolve", ::testConvolve),
+ Pair("histogram", ::testHistogram),
+ Pair("lut", ::testLut),
+ Pair("lut3d", ::testLut3d),
+ Pair("resize", ::testResize),
+ Pair("yuvToRgb", ::testYuvToRgb),
+ )
+ val results = Array(tests.size) { "" }
+ for (i in tests.indices) {
+ val (name, test) = tests[i]
+ println("Doing $name")
+ val success = test(timer)
+ results[i] = "$name " + if (success) "succeeded" else "FAILED! FAILED! FAILED! FAILED!"
+ println(" ${results[i]}")
+ }
+
+ return results.joinToString("\n")
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testBlend(timer: TimingTracker): Boolean {
+ return BlendingMode.values().all { mode ->
+ testOneBitmapBlend(timer, testImage1, testImage2, mode, null) and
+ testOneBitmapBlend(
+ timer, testImage1, testImage2, mode,
+ Range2d(6, 23, 2, 4)
+ ) and
+ commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
+ testOneRandomBlend(timer, sizeX, sizeY, mode, restriction)
+ }
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomBlend(
+ timer: TimingTracker,
+ sizeX: Int,
+ sizeY: Int,
+ mode: BlendingMode,
+ restriction: Range2d?
+ ): Boolean {
+ val sourceArray = randomByteArray(0x50521f0, sizeX, sizeY, 4)
+ val destArray = randomByteArray(0x2932147, sizeX, sizeY, 4)
+ // Make clones because these will be modified by the blend.
+ val intrinsicDestArray = destArray.clone()
+ val referenceDestArray = destArray.clone()
+ val toolkitDestArray = destArray.clone()
+
+ timer.measure("IntrinsicBlend") {
+ intrinsicBlend(
+ renderscriptContext, mode, sourceArray, intrinsicDestArray, sizeX, sizeY,
+ restriction
+ )
+ }
+ timer.measure("ToolkitBlend") {
+ toolkit.blend(mode, sourceArray, toolkitDestArray, sizeX, sizeY, restriction)
+ }
+ if (!validate) return true
+
+ timer.measure("ReferenceBlend") {
+ referenceBlend(mode, sourceArray, referenceDestArray, sizeX, sizeY, restriction)
+ }
+
+ return validateSame(
+ "Blend_$mode", intrinsicDestArray, referenceDestArray, toolkitDestArray
+ ) {
+ println("blend $mode ($sizeX, $sizeY) $restriction")
+ logArray("Blend_$mode src", sourceArray, 48)
+ logArray("Blend_$mode dst", destArray, 48)
+ logArray("Blend_$mode reference out", referenceDestArray, 48)
+ logArray("Blend_$mode intrinsic out", intrinsicDestArray, 48)
+ logArray("Blend_$mode toolkit out", toolkitDestArray, 48)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneBitmapBlend(
+ timer: TimingTracker,
+ sourceBitmap: Bitmap,
+ destBitmap: Bitmap,
+ mode: BlendingMode,
+ restriction: Range2d?
+ ): Boolean {
+ // Make clones because these will be modified by the blend.
+ val intrinsicDestBitmap = duplicateBitmap(destBitmap)
+ val toolkitDestBitmap = duplicateBitmap(destBitmap)
+ val referenceDestBitmap = duplicateBitmap(destBitmap)
+
+ timer.measure("IntrinsicBlend") {
+ intrinsicBlend(
+ renderscriptContext, mode, sourceBitmap, intrinsicDestBitmap, restriction
+ )
+ }
+ timer.measure("ToolkitBlend") {
+ toolkit.blend(mode, sourceBitmap, toolkitDestBitmap, restriction)
+ }
+ if (!validate) return true
+
+ val referenceDestArray = getBitmapBytes(referenceDestBitmap)
+ timer.measure("ReferenceBlend") {
+ referenceBlend(
+ mode, getBitmapBytes(sourceBitmap), referenceDestArray, sourceBitmap.width,
+ sourceBitmap.height, restriction
+ )
+ }
+
+ val intrinsicDestArray = getBitmapBytes(intrinsicDestBitmap)
+ val toolkitDestArray = getBitmapBytes(toolkitDestBitmap)
+ return validateSame(
+ "BlendBitmap_$mode", intrinsicDestArray, referenceDestArray, toolkitDestArray
+ ) {
+ println("BlendBitmap $mode $restriction")
+ //logArray("BlendBitmap_$mode src", sourceArray, 48)
+ //logArray("BlendBitmap_$mode dst", destArray, 48)
+ logArray("BlendBitmap_$mode reference out", referenceDestArray, 48)
+ logArray("BlendBitmap_$mode intrinsic out", intrinsicDestArray, 48)
+ logArray("BlendBitmap_$mode toolkit out", toolkitDestArray, 48)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testBlur(timer: TimingTracker): Boolean {
+ return arrayOf(1, 3, 8, 25).all { radius ->
+ testOneBitmapBlur(timer, testImage1, radius, null) and
+ testOneBitmapBlur(timer, testImage1, radius, Range2d(6, 23, 2, 4)) and
+ commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
+ arrayOf(1, 4).all { vectorSize ->
+ testOneRandomBlur(timer, vectorSize, sizeX, sizeY, radius, restriction)
+ }
+ }
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomBlur(
+ timer: TimingTracker,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ radius: Int,
+ restriction: Range2d?
+ ): Boolean {
+ val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, vectorSize)
+ val intrinsicOutArray = timer.measure("IntrinsicBlur") {
+ intrinsicBlur(
+ renderscriptContext, inputArray, vectorSize, sizeX, sizeY, radius, restriction
+ )
+ }
+ val toolkitOutArray = timer.measure("ToolkitBlur") {
+ toolkit.blur(inputArray, vectorSize, sizeX, sizeY, radius, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceBlur") {
+ referenceBlur(inputArray, vectorSize, sizeX, sizeY, radius, restriction)
+ }
+ return validateSame("blur", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("blur $vectorSize ($sizeX, $sizeY) radius = $radius $restriction")
+ logArray("blur input ", inputArray)
+ logArray("blur reference out", referenceOutArray)
+ logArray("blur intrinsic out", intrinsicOutArray)
+ logArray("blur toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneBitmapBlur(
+ timer: TimingTracker,
+ bitmap: Bitmap,
+ radius: Int,
+ restriction: Range2d?
+ ): Boolean {
+ val intrinsicOutArray = timer.measure("IntrinsicBlur") {
+ intrinsicBlur(renderscriptContext, bitmap, radius, restriction)
+ }
+
+ val toolkitOutBitmap = timer.measure("ToolkitBlur") {
+ toolkit.blur(bitmap, radius, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceBlur") {
+ referenceBlur(
+ getBitmapBytes(bitmap),
+ vectorSizeOfBitmap(bitmap),
+ bitmap.width,
+ bitmap.height,
+ radius,
+ restriction
+ )
+ }
+
+ val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+ return validateSame("blur", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("BlurBitmap ${bitmap.config} $radius $restriction")
+ logArray("blur reference out", referenceOutArray)
+ logArray("blur intrinsic out", intrinsicOutArray)
+ logArray("blur toolkit out", toolkitOutArray)
+ }
+ }
+
+ enum class ColorMatrixConversionType {
+ RGB_TO_YUV,
+ YUV_TO_RGB,
+ GREYSCALE,
+ RANDOM
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testColorMatrix(timer: TimingTracker): Boolean {
+ return ColorMatrixConversionType.values().all { conversion ->
+ testOneBitmapColorMatrix(timer, testImage1, conversion, null) and
+ testOneBitmapColorMatrix(
+ timer,
+ testImage1,
+ conversion,
+ Range2d(6, 23, 2, 4)
+ ) and
+ commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
+ (1..4).all { inputVectorSize ->
+ (1..4).all { outputVectorSize ->
+ testOneRandomColorMatrix(
+ timer,
+ inputVectorSize,
+ sizeX,
+ sizeY,
+ outputVectorSize,
+ conversion,
+ restriction
+ )
+ }
+ }
+ }
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomColorMatrix(
+ timer: TimingTracker,
+ inputVectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ outputVectorSize: Int,
+ conversion: ColorMatrixConversionType,
+ restriction: Range2d?
+ ): Boolean {
+ val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(inputVectorSize))
+ val addVector = randomFloatArray(0x243238, 4, 1, 1, 0.3f)
+ val matrix = when (conversion) {
+ ColorMatrixConversionType.RGB_TO_YUV -> toolkit.rgbToYuvMatrix
+ ColorMatrixConversionType.YUV_TO_RGB -> toolkit.yuvToRgbMatrix
+ ColorMatrixConversionType.GREYSCALE -> toolkit.greyScaleColorMatrix
+ ColorMatrixConversionType.RANDOM -> randomFloatArray(0x234348, 4, 4, 1)
+ }
+
+ val intrinsicOutArray = timer.measure("IntrinsicColorMatrix") {
+ intrinsicColorMatrix(
+ renderscriptContext,
+ conversion,
+ inputArray,
+ inputVectorSize,
+ sizeX,
+ sizeY,
+ outputVectorSize,
+ matrix,
+ addVector,
+ restriction
+ )
+ }
+ val toolkitOutArray = timer.measure("ToolkitColorMatrix") {
+ toolkit.colorMatrix(
+ inputArray,
+ inputVectorSize,
+ sizeX,
+ sizeY,
+ outputVectorSize,
+ matrix,
+ addVector,
+ restriction
+ )
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceColorMatrix") {
+ referenceColorMatrix(
+ inputArray, inputVectorSize, sizeX, sizeY, outputVectorSize, matrix, addVector,
+ restriction
+ )
+ }
+
+ return validateSame("colorMatrix", intrinsicOutArray, referenceOutArray, toolkitOutArray,
+ outputVectorSize == 3) {
+ println("colorMatrix ($sizeX, $sizeY) $inputVectorSize->$outputVectorSize $restriction")
+ logArray("colorMatrix matrix ", matrix, 16)
+ logArray("colorMatrix addVector", addVector, 4)
+ logArray("colorMatrix in ", inputArray)
+ logArray("colorMatrix reference out", referenceOutArray, 300)
+ logArray("colorMatrix intrinsic out", intrinsicOutArray, 300)
+ logArray("colorMatrix toolkit out", toolkitOutArray, 300)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneBitmapColorMatrix(
+ timer: TimingTracker,
+ bitmap: Bitmap,
+ conversion: ColorMatrixConversionType,
+ restriction: Range2d?
+ ): Boolean {
+ val addVector = randomFloatArray(0x243238, 4, 1, 1, 0.3f)
+ val matrix = when (conversion) {
+ ColorMatrixConversionType.RGB_TO_YUV -> toolkit.rgbToYuvMatrix
+ ColorMatrixConversionType.YUV_TO_RGB -> toolkit.yuvToRgbMatrix
+ ColorMatrixConversionType.GREYSCALE -> toolkit.greyScaleColorMatrix
+ ColorMatrixConversionType.RANDOM -> randomFloatArray(0x234348, 4, 4, 1)
+ }
+
+ val intrinsicOutArray = timer.measure("IntrinsicColorMatrix") {
+ intrinsicColorMatrix(
+ renderscriptContext, conversion, bitmap, matrix, addVector, restriction
+ )
+ }
+ val toolkitOutBitmap = timer.measure("ToolkitColorMatrix") {
+ toolkit.colorMatrix(bitmap, matrix, addVector, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceColorMatrix") {
+ referenceColorMatrix(
+ getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
+ vectorSizeOfBitmap(bitmap), matrix, addVector, restriction
+ )
+ }
+
+ val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+ return validateSame("ColorMatrix", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("colorMatrixBitmap $restriction")
+ logArray("colorMatrixBitmap matrix ", matrix, 16)
+ logArray("colorMatrixBitmap addVector", addVector, 4)
+ logArray("colorMatrixBitmap reference out", referenceOutArray)
+ logArray("colorMatrixBitmap intrinsic out", intrinsicOutArray)
+ logArray("colorMatrixBitmap toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testConvolve(timer: TimingTracker): Boolean {
+ val coefficientsToTry = listOf(
+ randomFloatArray(0x2937021, 3, 3, 1, 0.1f),
+ randomFloatArray(0x2937021, 5, 5, 1, 0.05f)
+ )
+ return coefficientsToTry.all { coefficients ->
+ testOneBitmapConvolve(timer, testImage1, coefficients, null) and
+ testOneBitmapConvolve(timer, testImage1, coefficients, Range2d(6, 23, 2, 4)) and
+
+ commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
+ (1..4).all { vectorSize ->
+ testOneRandomConvolve(
+ timer,
+ vectorSize,
+ sizeX,
+ sizeY,
+ coefficients,
+ restriction
+ )
+ }
+ }
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomConvolve(
+ timer: TimingTracker,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ coefficients: FloatArray,
+ restriction: Range2d?
+ ): Boolean {
+ val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(vectorSize))
+
+ val intrinsicOutArray = timer.measure("IntrinsicConvolve") {
+ intrinsicConvolve(
+ renderscriptContext, inputArray, vectorSize, sizeX, sizeY, coefficients, restriction
+ )
+ }
+ val toolkitOutArray = timer.measure("ToolkitConvolve") {
+ toolkit.convolve(inputArray, vectorSize, sizeX, sizeY, coefficients, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceConvolve") {
+ referenceConvolve(inputArray, vectorSize, sizeX, sizeY, coefficients, restriction)
+ }
+
+ val task = if (coefficients.size == 9) "convolve3x3 $vectorSize" else "convolve5x5 $vectorSize"
+ return validateSame(task, intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("Convolve $vectorSize ($sizeX, $sizeY) $restriction")
+ logArray("Convolve coefficients", coefficients, 25)
+ logArray("Convolve in ", inputArray)
+ logArray("Convolve reference out", referenceOutArray)
+ logArray("Convolve intrinsic out", intrinsicOutArray)
+ logArray("Convolve toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneBitmapConvolve(
+ timer: TimingTracker,
+ bitmap: Bitmap,
+ coefficients: FloatArray,
+ restriction: Range2d?
+ ): Boolean {
+ val intrinsicOutArray = timer.measure("IntrinsicConvolve") {
+ intrinsicConvolve(renderscriptContext, bitmap, coefficients, restriction)
+ }
+ val toolkitOutBitmap = timer.measure("ToolkitConvolve") {
+ toolkit.convolve(bitmap, coefficients, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceConvolve") {
+ referenceConvolve(
+ getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
+ coefficients, restriction
+ )
+ }
+
+ val task = if (coefficients.size == 9) "convolve3x3" else "convolve5x5"
+ val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+ return validateSame(task, intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("ConvolveBitmap $restriction")
+ logArray("ConvolveBitmap coefficients", coefficients, 25)
+ //logArray("ConvolveBitmap in ", inputArray)
+ logArray("ConvolveBitmap reference out", referenceOutArray)
+ logArray("ConvolveBitmap intrinsic out", intrinsicOutArray)
+ logArray("ConvolveBitmap toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testHistogram(timer: TimingTracker): Boolean {
+ val coefficients = floatArrayOf(0.1f, 0.3f, 0.5f, 0.05f)
+ return testOneBitmapHistogram(timer, testImage1, null) and
+ testOneBitmapHistogram(timer, testImage1, Range2d(6, 23, 2, 4)) and
+ testOneBitmapHistogramDot(timer, testImage1, null, null) and
+ testOneBitmapHistogramDot(timer, testImage1, coefficients, null) and
+ testOneBitmapHistogramDot(timer, testImage1, coefficients, Range2d(6, 23, 2, 4)) and
+ commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
+ (1..4).all { vectorSize ->
+ testOneRandomHistogram(timer, vectorSize, sizeX, sizeY, restriction) &&
+ testOneRandomHistogramDot(
+ timer,
+ vectorSize,
+ sizeX,
+ sizeY,
+ null,
+ restriction
+ ) &&
+ testOneRandomHistogramDot(
+ timer,
+ vectorSize,
+ sizeX,
+ sizeY,
+ coefficients.sliceArray(0 until vectorSize),
+ restriction
+ )
+ }
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomHistogram(
+ timer: TimingTracker,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d?
+ ): Boolean {
+ val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(vectorSize))
+
+ val intrinsicOutput = timer.measure("IntrinsicHistogram") {
+ intrinsicHistogram(
+ renderscriptContext, inputArray, vectorSize, sizeX, sizeY, restriction
+ )
+ }
+ val toolkitOutput = timer.measure("ToolkitHistogram") {
+ toolkit.histogram(inputArray, vectorSize, sizeX, sizeY, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutput = timer.measure("ReferenceHistogram") {
+ referenceHistogram(
+ inputArray, vectorSize, sizeX, sizeY, restriction
+ )
+ }
+
+ return validateSame("histogram", intrinsicOutput, referenceOutput, toolkitOutput, 0) {
+ println("histogram $vectorSize ($sizeX, $sizeY) $restriction")
+ logArray("histogram in ", inputArray, 200)
+ logArray("histogram reference out", referenceOutput, 200)
+ logArray("histogram intrinsic out", intrinsicOutput, 200)
+ logArray("histogram toolkit out", toolkitOutput, 200)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneBitmapHistogram(
+ timer: TimingTracker,
+ bitmap: Bitmap,
+ restriction: Range2d?
+ ): Boolean {
+ val intrinsicOutput = timer.measure("IntrinsicHistogram") {
+ intrinsicHistogram(renderscriptContext, bitmap, restriction)
+ }
+ val toolkitOutput = timer.measure("ToolkitHistogram") {
+ toolkit.histogram(bitmap, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutput = timer.measure("ReferenceHistogram") {
+ referenceHistogram(
+ getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
+ restriction
+ )
+ }
+
+ return validateSame("histogram", intrinsicOutput, referenceOutput, toolkitOutput, 0) {
+ println("HistogramBitmap $restriction")
+ logArray("HistogramBitmap reference out", referenceOutput)
+ logArray("HistogramBitmap intrinsic out", intrinsicOutput)
+ logArray("HistogramBitmap toolkit out", toolkitOutput)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomHistogramDot(
+ timer: TimingTracker,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ coefficients: FloatArray?, restriction: Range2d?
+ ): Boolean {
+ val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(vectorSize))
+
+ val intrinsicOutArray = timer.measure("IntrinsicHistogramDot") {
+ intrinsicHistogramDot(
+ renderscriptContext, inputArray, vectorSize, sizeX, sizeY, coefficients, restriction
+ )
+ }
+ val toolkitOutArray = timer.measure("ToolkitHistogramDot") {
+ toolkit.histogramDot(
+ inputArray, vectorSize, sizeX, sizeY, coefficients, restriction
+ )
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceHistogramDot") {
+ referenceHistogramDot(inputArray, vectorSize, sizeX, sizeY, coefficients, restriction)
+ }
+
+ return validateSame("histogramDot", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("histogramDot $vectorSize ($sizeX, $sizeY) $restriction")
+ logArray("histogramDot coefficients ", coefficients)
+ logArray("histogramDot in ", inputArray)
+ logArray("histogramDot reference out", referenceOutArray, 256)
+ logArray("histogramDot intrinsic out", intrinsicOutArray, 256)
+ logArray("histogramDot toolkit out", toolkitOutArray, 256)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneBitmapHistogramDot(
+ timer: TimingTracker,
+ bitmap: Bitmap,
+ coefficients: FloatArray?,
+ restriction: Range2d?
+ ): Boolean {
+ val intrinsicOutArray = timer.measure("IntrinsicHistogramDot") {
+ intrinsicHistogramDot(renderscriptContext, bitmap, coefficients, restriction)
+ }
+ val toolkitOutArray = timer.measure("ToolkitHistogramDot") {
+ toolkit.histogramDot(bitmap, coefficients, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceHistogramDot") {
+ referenceHistogramDot(
+ getBitmapBytes(bitmap),
+ vectorSizeOfBitmap(bitmap),
+ bitmap.width,
+ bitmap.height,
+ coefficients,
+ restriction
+ )
+ }
+
+ return validateSame(
+ "HistogramDotBitmap",
+ intrinsicOutArray,
+ referenceOutArray,
+ toolkitOutArray
+ ) {
+ println("HistogramDotBitmap $restriction")
+ logArray("HistogramDotBitmap coefficients ", coefficients)
+ //logArray("HistogramDotBitmap in ", inputArray)
+ logArray("HistogramDotBitmap reference out", referenceOutArray, 256)
+ logArray("HistogramDotBitmap intrinsic out", intrinsicOutArray, 256)
+ logArray("HistogramDotBitmap toolkit out", toolkitOutArray, 256)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testLut(timer: TimingTracker): Boolean {
+ return testOneBitmapLut(timer, testImage1, null) and
+ testOneBitmapLut(timer, testImage1, Range2d(6, 23, 2, 4)) and
+ commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
+ testOneRandomLut(timer, sizeX, sizeY, restriction)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomLut(
+ timer: TimingTracker,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d?
+ ): Boolean {
+ val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, 4)
+ val newRed = randomByteArray(0x32425, 256, 1, 1)
+ val newGreen = randomByteArray(0x1F3225, 256, 1, 1)
+ val newBlue = randomByteArray(0x32D4F27, 256, 1, 1)
+ val newAlpha = randomByteArray(0x3A20001, 256, 1, 1)
+ val table = LookupTable()
+ table.red = newRed
+ table.blue = newBlue
+ table.green = newGreen
+ table.alpha = newAlpha
+
+ val intrinsicOutArray = timer.measure("IntrinsicLUT") {
+ intrinsicLut(
+ renderscriptContext, inputArray, sizeX, sizeY, newRed, newGreen, newBlue, newAlpha,
+ restriction
+ )
+ }
+ val toolkitOutArray = timer.measure("ToolkitLUT") {
+ toolkit.lut(inputArray, sizeX, sizeY, table, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceLUT") {
+ referenceLut(inputArray, sizeX, sizeY, table, restriction)
+ }
+
+ return validateSame("LUT", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("lut ($sizeX, $sizeY) $restriction")
+ logArray("LUT red ", newRed, 256)
+ logArray("LUT green", newGreen, 256)
+ logArray("LUT blue ", newBlue, 256)
+ logArray("LUT alpha", newAlpha, 256)
+ logArray("LUT in ", inputArray)
+ logArray("LUT reference out", referenceOutArray)
+ logArray("LUT intrinsic out", intrinsicOutArray)
+ logArray("LUT toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneBitmapLut(
+ timer: TimingTracker,
+ bitmap: Bitmap,
+ restriction: Range2d?
+ ): Boolean {
+ val newRed = randomByteArray(0x32425, 256, 1, 1)
+ val newGreen = randomByteArray(0x1F3225, 256, 1, 1)
+ val newBlue = randomByteArray(0x32D4F27, 256, 1, 1)
+ val newAlpha = randomByteArray(0x3A20001, 256, 1, 1)
+ val table = LookupTable()
+ table.red = newRed
+ table.blue = newBlue
+ table.green = newGreen
+ table.alpha = newAlpha
+
+ val intrinsicOutArray = timer.measure("IntrinsicLUT") {
+ intrinsicLut(
+ renderscriptContext, bitmap, newRed, newGreen, newBlue, newAlpha, restriction
+ )
+ }
+ val toolkitOutBitmap = timer.measure("ToolkitLUT") {
+ toolkit.lut(bitmap, table, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceLUT") {
+ referenceLut(
+ getBitmapBytes(bitmap),
+ bitmap.width,
+ bitmap.height,
+ table,
+ restriction
+ )
+ }
+
+ val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+ return validateSame("LutBitmap", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("LutBitmap $restriction")
+ logArray("LutBitmap red ", newRed, 256)
+ logArray("LutBitmap green", newGreen, 256)
+ logArray("LutBitmap blue ", newBlue, 256)
+ logArray("LutBitmap alpha", newAlpha, 256)
+ //logArray("LutBitmap in ", inputArray, 80)
+ logArray("LutBitmap reference out", referenceOutArray)
+ logArray("LutBitmap intrinsic out", intrinsicOutArray)
+ logArray("LutBitmap toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testLut3d(timer: TimingTracker): Boolean {
+ val cubeSizesToTry = listOf(
+ Dimension(2, 2, 2),
+ Dimension(32, 32, 16),
+ Dimension(256, 256, 256)
+ )
+ return cubeSizesToTry.all { cubeSize ->
+ val identityCube = identityCube(cubeSize)
+ val randomCube = randomCube(0x23424, cubeSize)
+ testOneBitmapLut3d(timer, testImage1, cubeSize, identityCube, 1, null) and
+ testOneBitmapLut3d(timer, testImage2, cubeSize, randomCube, 3, null) and
+ testOneBitmapLut3d(timer, testImage2, cubeSize, randomCube, 3, Range2d(6, 23, 2, 4)) and
+ commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
+ testOneRandomLut3d(timer, sizeX, sizeY, cubeSize, identityCube, 1, restriction) &&
+ testOneRandomLut3d(
+ timer,
+ sizeX,
+ sizeY,
+ cubeSize,
+ randomCube,
+ 3,
+ restriction
+ )
+ }
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomLut3d(
+ timer: TimingTracker,
+ sizeX: Int,
+ sizeY: Int,
+ cubeSize: Dimension,
+ cubeArray: ByteArray,
+ allowedIntError: Int, restriction: Range2d?
+ ): Boolean {
+ val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, 4)
+
+ val intrinsicOutArray = timer.measure("IntrinsicLut3d") {
+ intrinsicLut3d(
+ renderscriptContext, inputArray, sizeX, sizeY, cubeArray, cubeSize, restriction
+ )
+ }
+ val toolkitOutArray = timer.measure("ToolkitLut3d") {
+ val toolkitCube = Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
+ toolkit.lut3d(inputArray, sizeX, sizeY, toolkitCube, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceLut3d") {
+ val cube = Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
+ referenceLut3d(inputArray, sizeX, sizeY, cube, restriction)
+ }
+
+ return validateSame(
+ "lut3d",
+ intrinsicOutArray,
+ referenceOutArray,
+ toolkitOutArray,
+ false,
+ allowedIntError
+ ) {
+ println("lut3d ($sizeX, $sizeY) $restriction")
+ logArray("lut3d cube", cubeArray, 256)
+ logArray("lut3d in ", inputArray, 64)
+ logArray("lut3d reference out", referenceOutArray, 64)
+ logArray("lut3d intrinsic out", intrinsicOutArray, 64)
+ logArray("lut3d toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneBitmapLut3d(
+ timer: TimingTracker,
+ bitmap: Bitmap,
+ cubeSize: Dimension,
+ cubeArray: ByteArray,
+ allowedIntError: Int, restriction: Range2d?
+ ): Boolean {
+ val intrinsicOutArray = timer.measure("IntrinsicLut3d") {
+ intrinsicLut3d(renderscriptContext, bitmap, cubeArray, cubeSize, restriction)
+ }
+ val toolkitOutBitmap = timer.measure("ToolkitLut3d") {
+ val toolkitCube = Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
+ toolkit.lut3d(bitmap, toolkitCube, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceLut3d") {
+ val cube = Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
+ referenceLut3d(getBitmapBytes(bitmap), bitmap.width, bitmap.height, cube, restriction)
+ }
+
+ val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+ return validateSame(
+ "Lut3dBitmap",
+ intrinsicOutArray,
+ referenceOutArray,
+ toolkitOutArray,
+ false,
+ allowedIntError
+ ) {
+ println("Lut3dBitmap $restriction")
+ logArray("Lut3dBitmap cube", cubeArray, 256)
+ //logArray("Lut3dBitmap in ", inputArray, 64)
+ logArray("Lut3dBitmap reference out", referenceOutArray, 64)
+ logArray("Lut3dBitmap intrinsic out", intrinsicOutArray, 64)
+ logArray("Lut3dBitmap toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testResize(timer: TimingTracker): Boolean {
+ val factorsToTry = listOf(
+ Pair(1f, 1f),
+ Pair(0.5f, 1f),
+ Pair(2f, 2f),
+ Pair(0.5f, 2f),
+ Pair(2f, 0.5f),
+ // The RenderScript Intrinsic tests used the above factors. It's tempting to use
+ // less regular ones like Pair(6.37f, 0.17f); however, this creates small offset
+ // errors between the results produced by the C++ code and the SIMD code. This is
+ // because the SIMD code uses a scaled integer increment to step from one pixel to the
+ // next, while the C++ code uses float operations.
+ )
+ val layoutsToTry = listOf(
+ TestLayout(37, 47, null),
+ TestLayout(60, 10, null),
+ TestLayout(6, 4, Range2d(1, 3, 0, 2)),
+ TestLayout(10, 14, Range2d(2, 3, 3, 7)),
+ )
+
+ return factorsToTry.all { (scaleX, scaleY) ->
+ // Do one resize that's greater than 4x, as that path is used in the code, but don't do it
+ // for everything, as some images would get very large.
+ testOneRandomResize(timer, 1, 25, 30, 6f, 6f, null) and
+ testOneBitmapResize(timer, testImage1, scaleX, scaleY, null) and
+ testOneBitmapResize(timer, testImage1, scaleX, scaleY, Range2d(6, 23, 2, 4)) and
+ layoutsToTry.all { (sizeX, sizeY, restriction) ->
+ (1..4).all { vectorSize ->
+ testOneRandomResize(
+ timer,
+ vectorSize,
+ sizeX,
+ sizeY,
+ scaleX,
+ scaleY,
+ restriction
+ )
+ }
+ }
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomResize(
+ timer: TimingTracker,
+ vectorSize: Int,
+ inSizeX: Int,
+ inSizeY: Int,
+ scaleX: Float,
+ scaleY: Float,
+ restriction: Range2d?
+ ): Boolean {
+ val inputArray = randomByteArray(0x50521f0, inSizeX, inSizeY, paddedSize(vectorSize))
+ val outSizeX = (inSizeX * scaleX).toInt()
+ val outSizeY = (inSizeY * scaleY).toInt()
+
+ val intrinsicOutArray = timer.measure("IntrinsicResize") {
+ intrinsicResize(
+ renderscriptContext, inputArray, vectorSize, inSizeX, inSizeY, outSizeX, outSizeY,
+ restriction
+ )
+ }
+ val toolkitOutArray = timer.measure("ToolkitResize") {
+ toolkit.resize(
+ inputArray, vectorSize, inSizeX, inSizeY, outSizeX, outSizeY, restriction
+ )
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceResize") {
+ referenceResize(
+ inputArray, vectorSize, inSizeX, inSizeY, outSizeX, outSizeY, restriction
+ )
+ }
+
+ return validateSame("resize", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("resize $vectorSize ($inSizeX, $inSizeY) by ($scaleX, $scaleY) to ($outSizeX, $outSizeY), $restriction")
+ logArray("resize in ", inputArray)
+ logArray("resize reference out", referenceOutArray)
+ logArray("resize intrinsic out", intrinsicOutArray)
+ logArray("resize toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneBitmapResize(
+ timer: TimingTracker,
+ bitmap: Bitmap,
+ scaleX: Float,
+ scaleY: Float,
+ restriction: Range2d?
+ ): Boolean {
+ // println("Doing resize $inSizeX x $inSizeY x $vectorSize, $scaleX x $scaleY, $restriction")
+ val outSizeX = (bitmap.width * scaleX).toInt()
+ val outSizeY = (bitmap.height * scaleY).toInt()
+
+ val intrinsicOutArray = timer.measure("IntrinsicResize") {
+ intrinsicResize(renderscriptContext, bitmap, outSizeX, outSizeY, restriction)
+ }
+ val toolkitOutBitmap = timer.measure("ToolkitResize") {
+ toolkit.resize(bitmap, outSizeX, outSizeY, restriction)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceResize") {
+ referenceResize(
+ getBitmapBytes(bitmap),
+ vectorSizeOfBitmap(bitmap),
+ bitmap.width,
+ bitmap.height,
+ outSizeX,
+ outSizeY,
+ restriction
+ )
+ }
+
+ val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+ return validateSame("ResizeBitmap", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("ResizeBitmap by ($scaleX, $scaleY) to ($outSizeX, $outSizeY), $restriction")
+ //logArray("ResizeBitmap in ", inputArray, 100)
+ logArray("ResizeBitmap reference out", referenceOutArray)
+ logArray("ResizeBitmap intrinsic out", intrinsicOutArray)
+ logArray("ResizeBitmap toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testYuvToRgb(timer: TimingTracker): Boolean {
+ val layoutsToTry = listOf(
+ // Don't try odd values for sizeX. They're not allowed by the definition of some
+ // of the video formats.
+ TestLayout(10, 14, null),
+ TestLayout(64, 40, null),
+ TestLayout(96, 94, null),
+ )
+ return layoutsToTry.all { (sizeX, sizeY, _) ->
+ YuvFormat.values().all { format ->
+ testOneRandomYuvToRgb(timer, sizeX, sizeY, format) and
+ testOneRandomYuvToRgbBitmap(timer, sizeX, sizeY, format)
+ }
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomYuvToRgb(
+ timer: TimingTracker,
+ sizeX: Int,
+ sizeY: Int,
+ format: YuvFormat
+ ): Boolean {
+ // The RenderScript Intrinsic does not handle this combination correctly.
+ if (format == YuvFormat.YV12 && sizeX % 32 != 0) {
+ return true
+ }
+ val inputArray = randomYuvArray(0x50521f0, sizeX, sizeY, format)
+
+ val intrinsicOutArray = timer.measure("IntrinsicYuvToRgb") {
+ intrinsicYuvToRgb(renderscriptContext, inputArray, sizeX, sizeY, format)
+ }
+ val toolkitOutArray = timer.measure("ToolkitYuvToRgb") {
+ toolkit.yuvToRgb(inputArray, sizeX, sizeY, format)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceYuvToRgb") {
+ referenceYuvToRgb(inputArray, sizeX, sizeY, format)
+ }
+
+ return validateSame("yuvToRgb", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("yuvToRgb ($sizeX, $sizeY) $format")
+ logArray("yuvToRgb in ", inputArray)
+ logArray("yuvToRgb reference out", referenceOutArray)
+ logArray("yuvToRgb intrinsic out", intrinsicOutArray)
+ logArray("yuvToRgb toolkit out", toolkitOutArray)
+ }
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun testOneRandomYuvToRgbBitmap(
+ timer: TimingTracker,
+ sizeX: Int,
+ sizeY: Int,
+ format: YuvFormat
+ ): Boolean {
+ // The RenderScript Intrinsic does not handle this combination correctly.
+ if (format == YuvFormat.YV12 && sizeX % 32 != 0) {
+ return true
+ }
+ val inputArray = randomYuvArray(0x50521f0, sizeX, sizeY, format)
+
+ val intrinsicOutArray = timer.measure("IntrinsicYuvToRgb") {
+ intrinsicYuvToRgb(renderscriptContext, inputArray, sizeX, sizeY, format)
+ }
+ val toolkitOutBitmap = timer.measure("ToolkitYuvToRgb") {
+ toolkit.yuvToRgbBitmap(inputArray, sizeX, sizeY, format)
+ }
+ if (!validate) return true
+
+ val referenceOutArray = timer.measure("ReferenceYuvToRgb") {
+ referenceYuvToRgb(inputArray, sizeX, sizeY, format)
+ }
+
+ val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+ return validateSame("yuvToRgb", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+ println("yuvToRgb ($sizeX, $sizeY) $format")
+ logArray("yuvToRgb in ", inputArray)
+ logArray("yuvToRgb reference out", referenceOutArray)
+ logArray("yuvToRgb intrinsic out", intrinsicOutArray)
+ logArray("yuvToRgb toolkit out", toolkitOutArray)
+ }
+ }
+
+ /**
+ * Verifies that the arrays returned by the Intrinsic, the reference code, and the Toolkit
+ * are all within a margin of error.
+ *
+ * The RenderScript Intrinsic test (rc/android/cts/rscpp/RSCppTest.java) used 3 for ints.
+ * For floats, rc/android/cts/rscpp/verify.rscript used 0.0001f.
+ */
+ @ExperimentalUnsignedTypes
+ private fun validateSame(
+ task: String,
+ intrinsic: ByteArray,
+ reference: ByteArray,
+ toolkit: ByteArray,
+ skipFourth: Boolean = false,
+ allowedIntDelta: Int = 3,
+ errorLogging: () -> Unit
+ ): Boolean {
+ val success = validateAgainstReference(
+ task, reference, "Intrinsic", intrinsic, skipFourth, allowedIntDelta
+ ) and validateAgainstReference(
+ task, reference, "Toolkit", toolkit, skipFourth, allowedIntDelta
+ )
+ if (!success) {
+ println("$task FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!")
+ errorLogging()
+ }
+ return success
+ }
+
+ private fun validateSame(
+ task: String,
+ intrinsic: IntArray,
+ reference: IntArray,
+ toolkit: IntArray,
+ allowedIntDelta: Int = 3,
+ errorLogging: () -> Unit
+ ): Boolean {
+ val success = validateAgainstReference(
+ task, reference, "Intrinsic", intrinsic, allowedIntDelta
+ ) and validateAgainstReference(
+ task, reference, "Toolkit", toolkit, allowedIntDelta
+ )
+ if (!success) {
+ println("$task FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!")
+ errorLogging()
+ }
+ return success
+ }
+
+ @ExperimentalUnsignedTypes
+ private fun validateAgainstReference(
+ task: String,
+ in1: ByteArray,
+ name2: String,
+ in2: ByteArray,
+ skipFourth: Boolean,
+ allowedIntDelta: Int
+ ): Boolean {
+ if (in1.size != in2.size) {
+ println("$task. Sizes don't match: Reference ${in1.size}, $name2 ${in2.size}")
+ return false
+ }
+ var same = true
+ val maxDetails = 80
+ val diffs = CharArray(min(in1.size, maxDetails)) {'.'}
+ for (i in in1.indices) {
+ if (skipFourth && i % 4 == 3) {
+ continue
+ }
+ val delta = abs(in1[i].toUByte().toInt() - in2[i].toUByte().toInt())
+ if (delta > allowedIntDelta) {
+ if (same) {
+ println(
+ "$task. At $i, Reference is ${in1[i].toUByte()}, $name2 is ${in2[i].toUByte()}"
+ )
+ }
+ if (i < maxDetails) diffs[i] = 'X'
+ same = false
+ }
+ }
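+ // If any byte differed by more than the allowed delta, print an index ruler (one
+ // label per four bytes) over a map of the first 80 bytes, where '.' marks a byte
+ // within the allowed delta and 'X' marks a larger difference.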
+ if (!same) {
+ for (i in 0 until (min(in1.size, maxDetails) / 4)) print("%-3d|".format(i))
+ println()
+ println(diffs)
+ }
+ return same
+ }
+
+ private fun validateAgainstReference(
+ task: String,
+ in1: IntArray,
+ name2: String,
+ in2: IntArray,
+ allowedIntDelta: Int
+ ): Boolean {
+ if (in1.size != in2.size) {
+ println("$task. Sizes don't match: Reference ${in1.size}, $name2 ${in2.size}")
+ return false
+ }
+ for (i in in1.indices) {
+ val delta = abs(in1[i] - in2[i])
+ if (delta > allowedIntDelta) {
+ println("$task. At $i, Reference is ${in1[i]}, $name2 is ${in2[i]}")
+ return false
+ }
+ }
+ return true
+ }
+}
diff --git a/toolkit/test/Android.bp b/toolkit/test/Android.bp
new file mode 100644
index 0000000..abeace1
--- /dev/null
+++ b/toolkit/test/Android.bp
@@ -0,0 +1,35 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+ default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_app {
+ name: "RenderScriptToolkitTest",
+// srcs: ["src/**/*.kt"],
+ sdk_version: "current",
+ resource_dirs: ["res"],
+// jni_libs: [ "librenderscripttoolkit"],
+// certificate: "platform",
+// //product_specific: true,
+// //optimize: {
+// // proguard_flags_files: ["proguard.flags"],
+// //},
+// shared_libs: ["librenderscripttoolkit",
+//
+// ]
+}
diff --git a/toolkit/test/AndroidManifest.xml b/toolkit/test/AndroidManifest.xml
new file mode 100644
index 0000000..f709790
--- /dev/null
+++ b/toolkit/test/AndroidManifest.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+ package="com.example.testapp">
+
+ <application
+ android:allowBackup="true"
+ android:label="Toolkit Test"
+ android:supportsRtl="true">
+ <activity android:name=".MainActivity">
+ <intent-filter>
+ <action android:name="android.intent.action.MAIN" />
+
+ <category android:name="android.intent.category.LAUNCHER" />
+ </intent-filter>
+ </activity>
+ </application>
+</manifest>
diff --git a/toolkit/test/BufferUtils.kt b/toolkit/test/BufferUtils.kt
new file mode 100644
index 0000000..f2197b0
--- /dev/null
+++ b/toolkit/test/BufferUtils.kt
@@ -0,0 +1,508 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.graphics.Canvas
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.toolkit.Range2d
+import android.renderscript.toolkit.Rgba3dArray
+import android.renderscript.toolkit.YuvFormat
+import java.nio.ByteBuffer
+import java.util.Random
+import kotlin.math.floor
+import kotlin.math.max
+import kotlin.math.min
+
+/**
+ * A vector of 4 integers.
+ */
+class Int4(
+ var x: Int = 0,
+ var y: Int = 0,
+ var z: Int = 0,
+ var w: Int = 0
+) {
+ operator fun plus(other: Int4) = Int4(x + other.x, y + other.y, z + other.z, w + other.w)
+ operator fun plus(n: Int) = Int4(x + n, y + n, z + n, w + n)
+
+ operator fun minus(other: Int4) = Int4(x - other.x, y - other.y, z - other.z, w - other.w)
+ operator fun minus(n: Int) = Int4(x - n, y - n, z - n, w - n)
+
+ operator fun times(other: Int4) = Int4(x * other.x, y * other.y, z * other.z, w * other.w)
+ operator fun times(n: Int) = Int4(x * n, y * n, z * n, w * n)
+
+ fun toFloat4() = Float4(x.toFloat(), y.toFloat(), z.toFloat(), w.toFloat())
+}
+
+fun min(a: Int4, b: Int4) = Int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w))
+
+/**
+ * A vector of 4 floats.
+ */
+data class Float4(
+ var x: Float = 0f,
+ var y: Float = 0f,
+ var z: Float = 0f,
+ var w: Float = 0f
+) {
+ operator fun plus(other: Float4) = Float4(x + other.x, y + other.y, z + other.z, w + other.w)
+ operator fun plus(f: Float) = Float4(x + f, y + f, z + f, w + f)
+
+ operator fun minus(other: Float4) = Float4(x - other.x, y - other.y, z - other.z, w - other.w)
+ operator fun minus(f: Float) = Float4(x - f, y - f, z - f, w - f)
+
+ operator fun times(other: Float4) = Float4(x * other.x, y * other.y, z * other.z, w * other.w)
+ operator fun times(f: Float) = Float4(x * f, y * f, z * f, w * f)
+
+ operator fun div(other: Float4) = Float4(x / other.x, y / other.y, z / other.z, w / other.w)
+ operator fun div(f: Float) = Float4(x / f, y / f, z / f, w / f)
+
+ fun intFloor() = Int4(floor(x).toInt(), floor(y).toInt(), floor(z).toInt(), floor(w).toInt())
+}
+
+/**
+ * Convert a UByteArray to a Float4 vector
+ */
+@ExperimentalUnsignedTypes
+fun UByteArray.toFloat4(): Float4 {
+ require(size == 4)
+ return Float4(this[0].toFloat(), this[1].toFloat(), this[2].toFloat(), this[3].toFloat())
+}
+
+/**
+ * Convert a ByteArray to a Float4 vector
+ */
+@ExperimentalUnsignedTypes
+fun ByteArray.toFloat4(): Float4 {
+ require(size == 4)
+ return Float4(
+ this[0].toUByte().toFloat(),
+ this[1].toUByte().toFloat(),
+ this[2].toUByte().toFloat(),
+ this[3].toUByte().toFloat()
+ )
+}
+
+data class Dimension(val sizeX: Int, val sizeY: Int, val sizeZ: Int)
+
+/**
+ * An RGBA value represented by 4 Ints.
+ *
+ * Note that the arithmetic operations treat a 0..255 value as the equivalent of 0f..1f.
+ * After adding or subtracting, the value is clamped. After multiplying, the value is rescaled
+ * (shifted right by 8) to stay in the 0..255 range. This is useful for the Blend operation.
+ */
+@ExperimentalUnsignedTypes
+data class Rgba(
+ var r: Int = 0,
+ var g: Int = 0,
+ var b: Int = 0,
+ var a: Int = 0
+) {
+ operator fun plus(other: Rgba) =
+ Rgba(r + other.r, g + other.g, b + other.b, a + other.a).clampToUByteRange()
+
+ operator fun minus(other: Rgba) =
+ Rgba(r - other.r, g - other.g, b - other.b, a - other.a).clampToUByteRange()
+
+ operator fun times(other: Rgba) = Rgba(r * other.r, g * other.g, b * other.b, a * other.a) shr 8
+ operator fun times(scalar: Int) = Rgba(r * scalar, g * scalar, b * scalar, a * scalar) shr 8
+
+ infix fun xor(other: Rgba) = Rgba(r xor other.r, g xor other.g, b xor other.b, a xor other.a)
+
+ infix fun shr(other: Int) = Rgba(r shr other, g shr other, b shr other, a shr other)
+
+ private fun clampToUByteRange() = Rgba(
+ r.clampToUByteRange(),
+ g.clampToUByteRange(),
+ b.clampToUByteRange(),
+ a.clampToUByteRange()
+ )
+}
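+
+// Worked example of the semantics above (illustrative only, not used by the tests):
+//   Rgba(128, 0, 0, 255) * Rgba(255, 0, 0, 255) == Rgba(127, 0, 0, 254)
+//     (multiplication rescales with shr 8: 128 * 255 shr 8 == 127)
+//   Rgba(200, 0, 0, 255) + Rgba(100, 0, 0, 10) == Rgba(255, 0, 0, 255)
+//     (addition clamps each channel to 0..255)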
+
+/**
+ * A 2D array of UByte vectors, stored in row-major format.
+ *
+ * Arrays of vectorSize == 3 are padded to 4.
+ */
+@ExperimentalUnsignedTypes
+class Vector2dArray(
+ val values: UByteArray,
+ val vectorSize: Int,
+ val sizeX: Int,
+ val sizeY: Int
+) {
+ /**
+ * If true, an index access that would fall out of bounds simply returns the nearest
+ * border value instead. E.g. [3, -3] returns the value at [3, 0], assuming that sizeX > 3.
+ */
+ var clipReadToRange: Boolean = false
+
+ operator fun get(x: Int, y: Int): UByteArray {
+ var fixedX = x
+ var fixedY = y
+ if (clipReadToRange) {
+ fixedX = min(max(x, 0), sizeX - 1)
+ fixedY = min(max(y, 0), sizeY - 1)
+ } else {
+ require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
+ }
+ val start = indexOfVector(fixedX, fixedY)
+ return UByteArray(paddedSize(vectorSize)) { values[start + it] }
+ }
+
+ operator fun set(x: Int, y: Int, value: UByteArray) {
+ require(value.size == paddedSize(vectorSize)) { "Not the expected vector size" }
+ require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
+ val start = indexOfVector(x, y)
+ for (i in value.indices) {
+ values[start + i] = value[i]
+ }
+ }
+
+ private fun indexOfVector(x: Int, y: Int) = ((y * sizeX) + x) * paddedSize(vectorSize)
+
+ fun createSameSized() = Vector2dArray(UByteArray(values.size), vectorSize, sizeX, sizeY)
+
+ fun forEach(restriction: Range2d?, work: (Int, Int) -> (Unit)) {
+ forEachCell(sizeX, sizeY, restriction, work)
+ }
+}
+
+/**
+ * A 2D array of float vectors, stored in row-major format.
+ *
+ * Arrays of vectorSize == 3 are padded to 4.
+ */
+class FloatVector2dArray(
+ val values: FloatArray,
+ val vectorSize: Int,
+ val sizeX: Int,
+ val sizeY: Int
+) {
+ /**
+ * If true, an index access that would fall out of bounds simply returns the nearest
+ * border value instead. E.g. [3, -3] returns the value at [3, 0], assuming that sizeX > 3.
+ */
+ var clipAccessToRange: Boolean = false
+
+ operator fun get(x: Int, y: Int): FloatArray {
+ var fixedX = x
+ var fixedY = y
+ if (clipAccessToRange) {
+ fixedX = min(max(x, 0), sizeX - 1)
+ fixedY = min(max(y, 0), sizeY - 1)
+ } else {
+ require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
+ }
+ val start = indexOfVector(fixedX, fixedY)
+ return FloatArray(vectorSize) { values[start + it] }
+ }
+
+ operator fun set(x: Int, y: Int, value: FloatArray) {
+ require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
+ val start = indexOfVector(x, y)
+ for (i in value.indices) {
+ values[start + i] = value[i]
+ }
+ }
+
+ private fun indexOfVector(x: Int, y: Int) = ((y * sizeX) + x) * paddedSize(vectorSize)
+
+ fun createSameSized() = FloatVector2dArray(FloatArray(values.size), vectorSize, sizeX, sizeY)
+
+ fun forEach(restriction: Range2d?, work: (Int, Int) -> (Unit)) {
+ forEachCell(sizeX, sizeY, restriction, work)
+ }
+}
+
+/**
+ * A 2D array of RGBA data.
+ */
+@ExperimentalUnsignedTypes
+class Rgba2dArray(
+ private val values: ByteArray,
+ val sizeX: Int,
+ val sizeY: Int
+) {
+ operator fun get(x: Int, y: Int): Rgba {
+ val i = indexOfVector(x, y)
+ return Rgba(
+ values[i].toUByte().toInt(),
+ values[i + 1].toUByte().toInt(),
+ values[i + 2].toUByte().toInt(),
+ values[i + 3].toUByte().toInt()
+ )
+ }
+
+ operator fun set(x: Int, y: Int, value: Rgba) {
+ // Verify that x, y, z, w are in the 0..255 range
+ require(value.r in 0..255)
+ require(value.g in 0..255)
+ require(value.b in 0..255)
+ require(value.a in 0..255)
+ val i = indexOfVector(x, y)
+ values[i] = value.r.toUByte().toByte()
+ values[i + 1] = value.g.toUByte().toByte()
+ values[i + 2] = value.b.toUByte().toByte()
+ values[i + 3] = value.a.toUByte().toByte()
+ }
+
+ private fun indexOfVector(x: Int, y: Int) = ((y * sizeX) + x) * 4
+
+ fun forEachCell(restriction: Range2d?, work: (Int, Int) -> (Unit)) =
+ forEachCell(sizeX, sizeY, restriction, work)
+}
+
+/**
+ * Return a value that's between start and end, with the fraction indicating how far along.
+ */
+fun mix(start: Float, end: Float, fraction: Float) = start + (end - start) * fraction
+
+fun mix(a: Float4, b: Float4, fraction: Float) = Float4(
+ mix(a.x, b.x, fraction),
+ mix(a.y, b.y, fraction),
+ mix(a.z, b.z, fraction),
+ mix(a.w, b.w, fraction)
+)
+
+/**
+ * For vectors of size 3, the original RenderScript has them occupy the same space as vectors
+ * of size 4. While RenderScript had a method to avoid this padding, it did not apply to
+ * Intrinsics.
+ *
+ * To preserve compatibility, the Toolkit does the same.
+ */
+fun paddedSize(vectorSize: Int) = if (vectorSize == 3) 4 else vectorSize
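+// For example, paddedSize(1) == 1, paddedSize(2) == 2, paddedSize(3) == 4 and paddedSize(4) == 4.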
+
+/**
+ * Create a ByteArray of the specified size filled with random data.
+ */
+fun randomByteArray(seed: Long, sizeX: Int, sizeY: Int, elementSize: Int): ByteArray {
+ val r = Random(seed)
+ return ByteArray(sizeX * sizeY * elementSize) { (r.nextInt(255) - 128).toByte() }
+}
+
+/**
+ * Create a FloatArray of the specified size filled with random data.
+ *
+ * By default, the random data is between 0f and 1f. The factor can be used to scale that.
+ */
+fun randomFloatArray(
+ seed: Long,
+ sizeX: Int,
+ sizeY: Int,
+ elementSize: Int,
+ factor: Float = 1f
+): FloatArray {
+ val r = Random(seed)
+ return FloatArray(sizeX * sizeY * elementSize) { r.nextFloat() * factor }
+}
+
+/**
+ * Create a cube of the specified size filled with random data.
+ */
+fun randomCube(seed: Long, cubeSize: Dimension): ByteArray {
+ val r = Random(seed)
+ return ByteArray(cubeSize.sizeX * cubeSize.sizeY * cubeSize.sizeZ * 4) {
+ (r.nextInt(255) - 128).toByte()
+ }
+}
+
+/**
+ * Create the identity cube, i.e. one that, when used in Lut3d, leaves the output the same as
+ * the input.
+ */
+@ExperimentalUnsignedTypes
+fun identityCube(cubeSize: Dimension): ByteArray {
+ val data = ByteArray(cubeSize.sizeX * cubeSize.sizeY * cubeSize.sizeZ * 4)
+ val cube = Rgba3dArray(data, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
+ for (z in 0 until cubeSize.sizeZ) {
+ for (y in 0 until cubeSize.sizeY) {
+ for (x in 0 until cubeSize.sizeX) {
+ cube[x, y, z] =
+ byteArrayOf(
+ (x * 255 / (cubeSize.sizeX - 1)).toByte(),
+ (y * 255 / (cubeSize.sizeY - 1)).toByte(),
+ (z * 255 / (cubeSize.sizeZ - 1)).toByte(),
+ (255).toByte()
+ )
+ }
+ }
+ }
+ return data
+}
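+
+// For example, with a 32 x 32 x 32 identity cube, cube[0, 0, 0] holds (0, 0, 0, 255) and
+// cube[31, 31, 31] holds (255, 255, 255, 255), so a 3D lookup maps each color back to
+// (approximately) itself.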
+
+fun randomYuvArray(seed: Long, sizeX: Int, sizeY: Int, format: YuvFormat): ByteArray {
+ // YUV formats are not well defined for odd dimensions
+ require(sizeX % 2 == 0 && sizeY % 2 == 0)
+ val halfSizeX = sizeX / 2
+ val halfSizeY = sizeY / 2
+ var totalSize = 0
+ when (format) {
+ YuvFormat.YV12 -> {
+ val strideX = roundUpTo16(sizeX)
+ totalSize = strideX * sizeY + roundUpTo16(strideX / 2) * halfSizeY * 2
+ }
+ YuvFormat.NV21 -> totalSize = sizeX * sizeY + halfSizeX * halfSizeY * 2
+ else -> require(false) { "Unknown YUV format $format" }
+ }
+
+ return randomByteArray(seed, totalSize, 1, 1)
+}
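+
+// Worked example of the buffer sizes above: for a 10 x 14 image, YV12 pads the luma stride
+// to 16 bytes, giving 16 * 14 + roundUpTo16(8) * 7 * 2 = 224 + 224 = 448 bytes, while
+// NV21 needs 10 * 14 + 5 * 7 * 2 = 210 bytes.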
+
+/**
+ * Converts a float to a UByte, rounding and clamping to fit the 0..255 range.
+ */
+@ExperimentalUnsignedTypes
+fun Float.clampToUByte(): UByte = min(255, max(0, (this + 0.5f).toInt())).toUByte()
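+// For example, 127.4f.clampToUByte() yields 127, 127.6f yields 128, -5f yields 0 and
+// 300f yields 255: the value is rounded (half up), then clamped to 0..255.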
+
+/**
+ * Converts a FloatArray to UByteArray, clamping.
+ */
+@ExperimentalUnsignedTypes
+fun FloatArray.clampToUByte() = UByteArray(size) { this[it].clampToUByte() }
+
+/**
+ * Limits an Int to what can fit in a UByte.
+ */
+fun Int.clampToUByteRange(): Int = min(255, max(0, this))
+
+/**
+ * Converts an Int to a UByte, clamping.
+ */
+@ExperimentalUnsignedTypes
+fun Int.clampToUByte(): UByte = this.clampToUByteRange().toUByte()
+
+/**
+ * Converts a float (0f .. 1f) to a UByte (0 .. 255)
+ */
+@ExperimentalUnsignedTypes
+fun unitFloatClampedToUByte(num: Float): UByte = (num * 255f).clampToUByte()
+
+/**
+ * Converts a UByte (0 .. 255) to a float (0f .. 1f)
+ */
+@ExperimentalUnsignedTypes
+fun byteToUnitFloat(num: UByte) = num.toFloat() * 0.003921569f
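+// 0.003921569f is 1 / 255, so byteToUnitFloat maps 255u to approximately 1f.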
+
+@ExperimentalUnsignedTypes
+fun UByteArray.toFloatArray() = FloatArray(size) { this[it].toFloat() }
+
+/**
+ * For each cell that's in the 2D array defined by sizeX and sizeY, and clipped down by the
+ * restriction, invoke the work function.
+ */
+fun forEachCell(sizeX: Int, sizeY: Int, restriction: Range2d?, work: (Int, Int) -> (Unit)) {
+ val startX = restriction?.startX ?: 0
+ val startY = restriction?.startY ?: 0
+ val endX = restriction?.endX ?: sizeX
+ val endY = restriction?.endY ?: sizeY
+ for (y in startY until endY) {
+ for (x in startX until endX) {
+ work(x, y)
+ }
+ }
+}
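+
+// For example, with a restriction whose startX = 2, endX = 3, startY = 3 and endY = 7, the
+// work lambda runs only for x == 2 and y in 3..6; a null restriction visits every cell.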
+
+operator fun FloatArray.times(other: FloatArray) = FloatArray(size) { this[it] * other[it] }
+operator fun FloatArray.times(other: Float) = FloatArray(size) { this[it] * other }
+operator fun FloatArray.plus(other: FloatArray) = FloatArray(size) { this[it] + other[it] }
+operator fun FloatArray.minus(other: FloatArray) = FloatArray(size) { this[it] - other[it] }
+
+fun renderScriptVectorElementForU8(rs: RenderScript?, vectorSize: Int): Element {
+ when (vectorSize) {
+ 1 -> return Element.U8(rs)
+ 2 -> return Element.U8_2(rs)
+ 3 -> return Element.U8_3(rs)
+ 4 -> return Element.U8_4(rs)
+ }
+ throw java.lang.IllegalArgumentException("RenderScriptToolkit tests. Only vectors of size 1-4 are supported. $vectorSize provided.")
+}
+
+fun renderScriptVectorElementForI32(rs: RenderScript?, vectorSize: Int): Element {
+ when (vectorSize) {
+ 1 -> return Element.I32(rs)
+ 2 -> return Element.I32_2(rs)
+ 3 -> return Element.I32_3(rs)
+ 4 -> return Element.I32_4(rs)
+ }
+ throw java.lang.IllegalArgumentException("RenderScriptToolkit tests. Only vectors of size 1-4 are supported. $vectorSize provided.")
+}
+
+/* When we'll handle floats
+fun renderScriptVectorElementForF32(rs: RenderScript?, vectorSize: Int): Element {
+ when (vectorSize) {
+ 1 -> return Element.F32(rs)
+ 2 -> return Element.F32_2(rs)
+ 3 -> return Element.F32_3(rs)
+ 4 -> return Element.F32_4(rs)
+ }
+ throw java.lang.IllegalArgumentException("RenderScriptToolkit tests. Only vectors of size 1-4 are supported. $vectorSize provided.")
+}*/
+
+fun renderScriptElementForBitmap(context: RenderScript, bitmap: Bitmap): Element {
+ return when (val config = bitmap.config) {
+ Bitmap.Config.ALPHA_8 -> Element.A_8(context)
+ Bitmap.Config.ARGB_8888 -> Element.RGBA_8888(context)
+ else -> throw IllegalArgumentException("RenderScript Toolkit can't support bitmaps with config $config.")
+ }
+}
+
+fun getBitmapBytes(bitmap: Bitmap): ByteArray {
+ val buffer: ByteBuffer = ByteBuffer.allocate(bitmap.byteCount)
+ bitmap.copyPixelsToBuffer(buffer)
+ return buffer.array()
+}
+
+fun vectorSizeOfBitmap(bitmap: Bitmap): Int {
+ return when (val config = bitmap.config) {
+ Bitmap.Config.ALPHA_8 -> 1
+ Bitmap.Config.ARGB_8888 -> 4
+ else -> throw IllegalArgumentException("RenderScript Toolkit can't support bitmaps with config $config.")
+ }
+}
+
+fun duplicateBitmap(original: Bitmap): Bitmap {
+ val copy = Bitmap.createBitmap(original.width, original.height, original.config)
+ val canvas = Canvas(copy)
+ canvas.drawBitmap(original, 0f, 0f, null)
+ return copy
+}
+
+@ExperimentalUnsignedTypes
+fun logArray(prefix: String, array: ByteArray, number: Int = 20) {
+ val values = array.joinToString(limit = number) { it.toUByte().toString() }
+ println("$prefix[${array.size}] $values}\n")
+}
+
+fun logArray(prefix: String, array: IntArray, number: Int = 20) {
+ val values = array.joinToString(limit = number)
+ println("$prefix[${array.size}] $values}\n")
+}
+
+fun logArray(prefix: String, array: FloatArray?, number: Int = 20) {
+ val values = array?.joinToString(limit = number) { "%.2f".format(it) } ?: "(null)"
+ println("$prefix[${array?.size}] $values}\n")
+}
+
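+// Rounds a non-negative value up to the next multiple of 16 by adding 15 and clearing the
+// low four bits, e.g. roundUpTo16(0) == 0, roundUpTo16(1) == 16 and roundUpTo16(32) == 32.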
+fun roundUpTo16(value: Int): Int {
+ require(value >= 0)
+ return (value + 15) and 15.inv()
+}
diff --git a/toolkit/test/IntrinsicBlend.kt b/toolkit/test/IntrinsicBlend.kt
new file mode 100644
index 0000000..873cb15
--- /dev/null
+++ b/toolkit/test/IntrinsicBlend.kt
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicBlend
+import android.renderscript.Type
+import android.renderscript.toolkit.BlendingMode
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a Blend operation using the RenderScript Intrinsics.
+ */
+fun intrinsicBlend(
+ context: RenderScript,
+ mode: BlendingMode,
+ sourceArray: ByteArray,
+ destArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d?
+) {
+ val scriptBlend = ScriptIntrinsicBlend.create(context, Element.U8_4(context))
+ val builder = Type.Builder(context, Element.U8_4(context))
+ builder.setX(sizeX)
+ builder.setY(sizeY)
+ val arrayType = builder.create()
+ val sourceAllocation = Allocation.createTyped(context, arrayType)
+ val destAllocation = Allocation.createTyped(context, arrayType)
+ sourceAllocation.copyFrom(sourceArray)
+ destAllocation.copyFrom(destArray)
+
+ callBlendForEach(scriptBlend, sourceAllocation, destAllocation, mode, restriction)
+ destAllocation.copyTo(destArray)
+
+ sourceAllocation.destroy()
+ destAllocation.destroy()
+ arrayType.destroy()
+ scriptBlend.destroy()
+}
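+
+// Illustrative call (rs, the arrays, and the sizes are assumptions, not taken from this file):
+// composite a 64 x 40 RGBA source over a same-sized destination, in place in destArray:
+//   intrinsicBlend(rs, BlendingMode.SRC_OVER, sourceArray, destArray, 64, 40, null)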
+
+fun intrinsicBlend(
+ context: RenderScript,
+ mode: BlendingMode,
+ sourceBitmap: Bitmap,
+ destBitmap: Bitmap,
+ restriction: Range2d?
+) {
+ val scriptBlend = ScriptIntrinsicBlend.create(context, Element.U8_4(context))
+ val sourceAllocation = Allocation.createFromBitmap(context, sourceBitmap)
+ val destAllocation = Allocation.createFromBitmap(context, destBitmap)
+ sourceAllocation.copyFrom(sourceBitmap)
+ destAllocation.copyFrom(destBitmap)
+
+ callBlendForEach(scriptBlend, sourceAllocation, destAllocation, mode, restriction)
+ destAllocation.copyTo(destBitmap)
+
+ sourceAllocation.destroy()
+ destAllocation.destroy()
+ scriptBlend.destroy()
+}
+
+private fun callBlendForEach(
+ scriptBlend: ScriptIntrinsicBlend,
+ sourceAllocation: Allocation,
+ destAllocation: Allocation,
+ mode: BlendingMode,
+ restriction: Range2d?
+) {
+ if (restriction != null) {
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ when (mode) {
+ BlendingMode.CLEAR -> scriptBlend.forEachClear(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.SRC -> scriptBlend.forEachSrc(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.DST -> scriptBlend.forEachDst(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.SRC_OVER -> scriptBlend.forEachSrcOver(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.DST_OVER -> scriptBlend.forEachDstOver(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.SRC_IN -> scriptBlend.forEachSrcIn(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.DST_IN -> scriptBlend.forEachDstIn(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.SRC_OUT -> scriptBlend.forEachSrcOut(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.DST_OUT -> scriptBlend.forEachDstOut(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.SRC_ATOP -> scriptBlend.forEachSrcAtop(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.DST_ATOP -> scriptBlend.forEachDstAtop(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.XOR -> scriptBlend.forEachXor(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.MULTIPLY -> scriptBlend.forEachMultiply(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.ADD -> scriptBlend.forEachAdd(
+ sourceAllocation, destAllocation, options
+ )
+ BlendingMode.SUBTRACT -> scriptBlend.forEachSubtract(
+ sourceAllocation, destAllocation, options
+ )
+ }
+ } else {
+ when (mode) {
+ BlendingMode.CLEAR -> scriptBlend.forEachClear(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.SRC -> scriptBlend.forEachSrc(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.DST -> scriptBlend.forEachDst(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.SRC_OVER -> scriptBlend.forEachSrcOver(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.DST_OVER -> scriptBlend.forEachDstOver(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.SRC_IN -> scriptBlend.forEachSrcIn(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.DST_IN -> scriptBlend.forEachDstIn(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.SRC_OUT -> scriptBlend.forEachSrcOut(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.DST_OUT -> scriptBlend.forEachDstOut(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.SRC_ATOP -> scriptBlend.forEachSrcAtop(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.DST_ATOP -> scriptBlend.forEachDstAtop(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.XOR -> scriptBlend.forEachXor(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.MULTIPLY -> scriptBlend.forEachMultiply(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.ADD -> scriptBlend.forEachAdd(
+ sourceAllocation, destAllocation
+ )
+ BlendingMode.SUBTRACT -> scriptBlend.forEachSubtract(
+ sourceAllocation, destAllocation
+ )
+ }
+ }
+}
diff --git a/toolkit/test/IntrinsicBlur.kt b/toolkit/test/IntrinsicBlur.kt
new file mode 100644
index 0000000..be09094
--- /dev/null
+++ b/toolkit/test/IntrinsicBlur.kt
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicBlur
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a Blur operation using the RenderScript Intrinsics.
+ */
+fun intrinsicBlur(
+ context: RenderScript,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ radius: Int,
+ restriction: Range2d?
+): ByteArray {
+ val scriptBlur = ScriptIntrinsicBlur.create(
+ context,
+ if (vectorSize == 4) Element.RGBA_8888(context) else Element.U8(context)
+ )
+ val builder =
+ Type.Builder(
+ context,
+ renderScriptVectorElementForU8(context, vectorSize)
+ )
+ builder.setX(sizeX)
+ builder.setY(sizeY)
+ val arrayType = builder.create()
+ val inputAllocation = Allocation.createTyped(context, arrayType)
+ inputAllocation.copyFrom(inputArray)
+ val outAllocation = Allocation.createTyped(context, arrayType)
+
+ val intrinsicOutArray = ByteArray(sizeX * sizeY * vectorSize)
+ scriptBlur.setRadius(radius.toFloat())
+ scriptBlur.setInput(inputAllocation)
+
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptBlur.forEach(outAllocation, options)
+ } else {
+ scriptBlur.forEach(outAllocation)
+ }
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ arrayType.destroy()
+ scriptBlur.destroy()
+ return intrinsicOutArray
+}
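+
+// Illustrative call (rs, inputArray, and the sizes are assumptions, not taken from this file):
+// blur a 64 x 40 single-channel image with a radius of 5 over its full extent:
+//   val blurred = intrinsicBlur(rs, inputArray, 1, 64, 40, 5, null)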
+
+fun intrinsicBlur(
+ context: RenderScript,
+ bitmap: Bitmap,
+ radius: Int,
+ restriction: Range2d?
+): ByteArray {
+ val baseElement = renderScriptElementForBitmap(context, bitmap)
+ val scriptBlur = ScriptIntrinsicBlur.create(context, baseElement)
+ val inputAllocation = Allocation.createFromBitmap(context, bitmap)
+ inputAllocation.copyFrom(bitmap)
+ val outAllocation = Allocation.createTyped(context, inputAllocation.type)
+ val intrinsicOutArray = ByteArray(bitmap.byteCount)
+
+ scriptBlur.setRadius(radius.toFloat())
+ scriptBlur.setInput(inputAllocation)
+
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptBlur.forEach(outAllocation, options)
+ } else {
+ scriptBlur.forEach(outAllocation)
+ }
+ outAllocation.copyTo(intrinsicOutArray)
+
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ scriptBlur.destroy()
+ return intrinsicOutArray
+}
diff --git a/toolkit/test/IntrinsicColorMatrix.kt b/toolkit/test/IntrinsicColorMatrix.kt
new file mode 100644
index 0000000..c0ccc67
--- /dev/null
+++ b/toolkit/test/IntrinsicColorMatrix.kt
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Matrix4f
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicColorMatrix
+import android.renderscript.Type
+import android.renderscript.Float4
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a ColorMatrix operation using the RenderScript Intrinsics.
+ */
+fun intrinsicColorMatrix(
+ context: RenderScript,
+ conversion: Tester.ColorMatrixConversionType,
+ inputArray: ByteArray,
+ inputVectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ outputVectorSize: Int,
+ matrix: FloatArray,
+ addVector: FloatArray,
+ restriction: Range2d?
+): ByteArray {
+ val scriptColorMatrix = ScriptIntrinsicColorMatrix.create(context)
+ val inputBuilder = Type.Builder(
+ context, renderScriptVectorElementForU8(
+ context,
+ inputVectorSize
+ )
+ )
+ inputBuilder.setX(sizeX)
+ inputBuilder.setY(sizeY)
+ val inputArrayType = inputBuilder.create()
+ val inputAllocation = Allocation.createTyped(context, inputArrayType)
+ val outputBuilder = Type.Builder(
+ context, renderScriptVectorElementForU8(
+ context,
+ outputVectorSize
+ )
+ )
+ outputBuilder.setX(sizeX)
+ outputBuilder.setY(sizeY)
+ val outputArrayType = outputBuilder.create()
+ val outAllocation = Allocation.createTyped(context, outputArrayType)
+
+ inputAllocation.copyFrom(inputArray)
+ val intrinsicOutArray = ByteArray(sizeX * sizeY * paddedSize(outputVectorSize))
+ when (conversion) {
+ Tester.ColorMatrixConversionType.RGB_TO_YUV -> scriptColorMatrix.setRGBtoYUV()
+ Tester.ColorMatrixConversionType.YUV_TO_RGB -> scriptColorMatrix.setYUVtoRGB()
+ Tester.ColorMatrixConversionType.GREYSCALE -> scriptColorMatrix.setGreyscale()
+ Tester.ColorMatrixConversionType.RANDOM -> {
+ val m = Matrix4f()
+ var index = 0
+ // RS is column major
+ for (x in 0..3) {
+ for (y in 0..3) {
+ m.set(x, y, matrix[index++])
+ }
+ }
+ scriptColorMatrix.setColorMatrix(m)
+ }
+ }
+ val vector = Float4(
+ addVector[0],
+ addVector[1],
+ addVector[2],
+ addVector[3]
+ )
+ scriptColorMatrix.setAdd(vector)
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptColorMatrix.forEach(inputAllocation, outAllocation, options)
+ } else {
+ scriptColorMatrix.forEach(inputAllocation, outAllocation)
+ }
+ outAllocation.copyTo(intrinsicOutArray)
+
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ inputArrayType.destroy()
+ outputArrayType.destroy()
+ scriptColorMatrix.destroy()
+ return intrinsicOutArray
+}
+
+fun intrinsicColorMatrix(
+ context: RenderScript,
+ conversion: Tester.ColorMatrixConversionType,
+ bitmap: Bitmap,
+ matrix: FloatArray,
+ addVector: FloatArray,
+ restriction: Range2d?
+): ByteArray {
+ val scriptColorMatrix = ScriptIntrinsicColorMatrix.create(context)
+ val inputAllocation = Allocation.createFromBitmap(context, bitmap)
+ inputAllocation.copyFrom(bitmap)
+ val outAllocation = Allocation.createTyped(context, inputAllocation.type)
+ val intrinsicOutArray = ByteArray(bitmap.byteCount)
+
+ when (conversion) {
+ Tester.ColorMatrixConversionType.RGB_TO_YUV -> scriptColorMatrix.setRGBtoYUV()
+ Tester.ColorMatrixConversionType.YUV_TO_RGB -> scriptColorMatrix.setYUVtoRGB()
+ Tester.ColorMatrixConversionType.GREYSCALE -> scriptColorMatrix.setGreyscale()
+ Tester.ColorMatrixConversionType.RANDOM -> {
+ val m = Matrix4f()
+ var index = 0
+ // RS is column major
+ for (x in 0..3) {
+ for (y in 0..3) {
+ m.set(x, y, matrix[index++])
+ }
+ }
+ scriptColorMatrix.setColorMatrix(m)
+ }
+ }
+ val vector = Float4(
+ addVector[0],
+ addVector[1],
+ addVector[2],
+ addVector[3]
+ )
+ scriptColorMatrix.setAdd(vector)
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptColorMatrix.forEach(inputAllocation, outAllocation, options)
+ } else {
+ scriptColorMatrix.forEach(inputAllocation, outAllocation)
+ }
+ outAllocation.copyTo(intrinsicOutArray)
+
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ scriptColorMatrix.destroy()
+ return intrinsicOutArray
+}
diff --git a/toolkit/test/IntrinsicConvolve.kt b/toolkit/test/IntrinsicConvolve.kt
new file mode 100644
index 0000000..0c9e4f0
--- /dev/null
+++ b/toolkit/test/IntrinsicConvolve.kt
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicConvolve3x3
+import android.renderscript.ScriptIntrinsicConvolve5x5
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a Convolve operation using the RenderScript Intrinsics.
+ */
+fun intrinsicConvolve(
+ context: RenderScript,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ coefficients: FloatArray,
+ restriction: Range2d?
+): ByteArray {
+ val baseElement = renderScriptVectorElementForU8(context, vectorSize)
+ val builder = Type.Builder(context, baseElement)
+ builder.setX(sizeX)
+ builder.setY(sizeY)
+ val arrayType = builder.create()
+ val inputAllocation = Allocation.createTyped(context, arrayType)
+ val outAllocation = Allocation.createTyped(context, arrayType)
+ inputAllocation.copyFrom(inputArray)
+ val intrinsicOutArray = ByteArray(sizeX * sizeY * paddedSize(vectorSize))
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ }
+ invokeConvolveKernel(
+ coefficients,
+ context,
+ baseElement,
+ inputAllocation,
+ restriction,
+ outAllocation
+ )
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ arrayType.destroy()
+ return intrinsicOutArray
+}
+
+fun intrinsicConvolve(
+ context: RenderScript,
+ bitmap: Bitmap,
+ coefficients: FloatArray,
+ restriction: Range2d?
+): ByteArray {
+ val baseElement = renderScriptElementForBitmap(context, bitmap)
+
+ val inputAllocation = Allocation.createFromBitmap(context, bitmap)
+ val outAllocation = Allocation.createTyped(context, inputAllocation.type)
+ val intrinsicOutArray = ByteArray(bitmap.byteCount)
+ inputAllocation.copyFrom(bitmap)
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ }
+ invokeConvolveKernel(
+ coefficients,
+ context,
+ baseElement,
+ inputAllocation,
+ restriction,
+ outAllocation
+ )
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ return intrinsicOutArray
+}
+
+private fun invokeConvolveKernel(
+ coefficients: FloatArray,
+ context: RenderScript,
+ baseElement: Element,
+ inputAllocation: Allocation?,
+ restriction: Range2d?,
+ outAllocation: Allocation?
+) {
+ when (coefficients.size) {
+ 9 -> {
+ val scriptConvolve3x3 =
+ ScriptIntrinsicConvolve3x3.create(context, baseElement)
+ scriptConvolve3x3.setCoefficients(coefficients)
+ scriptConvolve3x3.setInput(inputAllocation)
+ if (restriction != null) {
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptConvolve3x3.forEach(outAllocation, options)
+ } else {
+ scriptConvolve3x3.forEach(outAllocation)
+ }
+ scriptConvolve3x3.destroy()
+ }
+ 25 -> {
+ val scriptConvolve5x5 =
+ ScriptIntrinsicConvolve5x5.create(context, baseElement)
+ scriptConvolve5x5.setCoefficients(coefficients)
+ scriptConvolve5x5.setInput(inputAllocation)
+ if (restriction != null) {
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptConvolve5x5.forEach(outAllocation, options)
+ } else {
+ scriptConvolve5x5.forEach(outAllocation)
+ }
+ scriptConvolve5x5.destroy()
+ }
+ else -> {
+ throw IllegalArgumentException("RenderScriptToolkit tests. Only 3x3 and 5x5 convolutions are supported. ${coefficients.size} coefficients provided.")
+ }
+ }
+}
diff --git a/toolkit/test/IntrinsicHistogram.kt b/toolkit/test/IntrinsicHistogram.kt
new file mode 100644
index 0000000..25cc55d
--- /dev/null
+++ b/toolkit/test/IntrinsicHistogram.kt
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicHistogram
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a Histogram operation using the RenderScript Intrinsics.
+ */
+fun intrinsicHistogram(
+ context: RenderScript,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d?
+): IntArray {
+ val element = renderScriptVectorElementForU8(context, vectorSize)
+ val scriptHistogram = ScriptIntrinsicHistogram.create(context, element)
+ val builder = Type.Builder(context, element)
+ builder.setX(sizeX)
+ builder.setY(sizeY)
+ val arrayType = builder.create()
+ val inputAllocation = Allocation.createTyped(context, arrayType)
+ val outAllocation =
+ Allocation.createSized(
+ context,
+ renderScriptVectorElementForI32(context, vectorSize),
+ 256
+ )
+ inputAllocation.copyFrom(inputArray)
+ scriptHistogram.setOutput(outAllocation)
+ if (restriction != null) {
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptHistogram.forEach(inputAllocation, options)
+ } else {
+ scriptHistogram.forEach(inputAllocation)
+ }
+
+ val intrinsicOutArray = IntArray(256 * paddedSize(vectorSize))
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ arrayType.destroy()
+ scriptHistogram.destroy()
+ return intrinsicOutArray
+}
+
+fun intrinsicHistogram(
+ context: RenderScript,
+ bitmap: Bitmap,
+ restriction: Range2d?
+): IntArray {
+ val baseElement = renderScriptElementForBitmap(context, bitmap)
+ val scriptHistogram = ScriptIntrinsicHistogram.create(context, baseElement)
+ val inputAllocation = Allocation.createFromBitmap(context, bitmap)
+ inputAllocation.copyFrom(bitmap)
+ val vectorSize = vectorSizeOfBitmap(bitmap)
+ val outAllocation =
+ Allocation.createSized(
+ context,
+ renderScriptVectorElementForI32(context, vectorSize),
+ 256
+ )
+ scriptHistogram.setOutput(outAllocation)
+ if (restriction != null) {
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptHistogram.forEach(inputAllocation, options)
+ } else {
+ scriptHistogram.forEach(inputAllocation)
+ }
+
+ val intrinsicOutArray = IntArray(256 * vectorSize)
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ scriptHistogram.destroy()
+ return intrinsicOutArray
+}
+
+fun intrinsicHistogramDot(
+ context: RenderScript,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ coefficients: FloatArray?,
+ restriction: Range2d?
+): IntArray {
+ val element = renderScriptVectorElementForU8(context, vectorSize)
+ val scriptHistogram = ScriptIntrinsicHistogram.create(context, element)
+ val builder = Type.Builder(context, element)
+ builder.setX(sizeX)
+ builder.setY(sizeY)
+ val arrayType = builder.create()
+ val inputAllocation = Allocation.createTyped(context, arrayType)
+ val outAllocation =
+ Allocation.createSized(context, Element.I32(context), 256)
+ inputAllocation.copyFrom(inputArray)
+
+ if (coefficients != null) {
+ require(coefficients.size == vectorSize) {
+ "RenderScriptToolkit tests. $vectorSize coefficients are required for histogram. " +
+ "${coefficients.size} provided."
+ }
+ scriptHistogram.setDotCoefficients(
+ coefficients[0],
+ if (vectorSize > 1) coefficients[1] else 0f,
+ if (vectorSize > 2) coefficients[2] else 0f,
+ if (vectorSize > 3) coefficients[3] else 0f
+ )
+ }
+ scriptHistogram.setOutput(outAllocation)
+ if (restriction != null) {
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptHistogram.forEach_Dot(inputAllocation, options)
+ } else {
+ scriptHistogram.forEach_Dot(inputAllocation)
+ }
+ val intrinsicOutArray = IntArray(256)
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ arrayType.destroy()
+ scriptHistogram.destroy()
+ return intrinsicOutArray
+}
+
+fun intrinsicHistogramDot(
+ context: RenderScript,
+ bitmap: Bitmap,
+ coefficients: FloatArray?,
+ restriction: Range2d?
+): IntArray {
+ val baseElement = renderScriptElementForBitmap(context, bitmap)
+ val scriptHistogram = ScriptIntrinsicHistogram.create(context, baseElement)
+ val inputAllocation = Allocation.createFromBitmap(context, bitmap)
+ inputAllocation.copyFrom(bitmap)
+ val outAllocation =
+ Allocation.createSized(context, Element.I32(context), 256)
+
+ if (coefficients != null) {
+ require(coefficients.size == 4) {
+ "RenderScriptToolkit tests. Four coefficients are required for histogram. " +
+ "${coefficients.size} provided."
+ }
+ scriptHistogram.setDotCoefficients(
+ coefficients[0],
+ coefficients[1],
+ coefficients[2],
+ coefficients[3]
+ )
+ }
+ scriptHistogram.setOutput(outAllocation)
+ if (restriction != null) {
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptHistogram.forEach_Dot(inputAllocation, options)
+ } else {
+ scriptHistogram.forEach_Dot(inputAllocation)
+ }
+ val intrinsicOutArray = IntArray(256)
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ scriptHistogram.destroy()
+ return intrinsicOutArray
+}
diff --git a/toolkit/test/IntrinsicLut.kt b/toolkit/test/IntrinsicLut.kt
new file mode 100644
index 0000000..1ed03ac
--- /dev/null
+++ b/toolkit/test/IntrinsicLut.kt
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicLUT
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a LookUpTable operation using the RenderScript Intrinsics.
+ */
+@ExperimentalUnsignedTypes
+fun intrinsicLut(
+ context: RenderScript,
+ inputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ newRed: ByteArray,
+ newGreen: ByteArray,
+ newBlue: ByteArray,
+ newAlpha: ByteArray,
+ restriction: Range2d?
+): ByteArray {
+ val scriptLut: ScriptIntrinsicLUT = ScriptIntrinsicLUT.create(
+ context,
+ Element.U8_4(context)
+ )
+ val builder = Type.Builder(context, Element.U8_4(context))
+ builder.setX(sizeX)
+ builder.setY(sizeY)
+ val arrayType = builder.create()
+ val inputAllocation = Allocation.createTyped(context, arrayType)
+ val outAllocation = Allocation.createTyped(context, arrayType)
+ inputAllocation.copyFrom(inputArray)
+ val intrinsicOutArray = ByteArray(sizeX * sizeY * 4)
+
+ for (v in 0..255) {
+ scriptLut.setRed(v, newRed[v].toUByte().toInt())
+ scriptLut.setGreen(v, newGreen[v].toUByte().toInt())
+ scriptLut.setBlue(v, newBlue[v].toUByte().toInt())
+ scriptLut.setAlpha(v, newAlpha[v].toUByte().toInt())
+ }
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptLut.forEach(inputAllocation, outAllocation, options)
+ } else {
+ scriptLut.forEach(inputAllocation, outAllocation)
+ }
+
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ arrayType.destroy()
+ scriptLut.destroy()
+ return intrinsicOutArray
+}
+
+@ExperimentalUnsignedTypes
+fun intrinsicLut(
+ context: RenderScript,
+ bitmap: Bitmap,
+ newRed: ByteArray,
+ newGreen: ByteArray,
+ newBlue: ByteArray,
+ newAlpha: ByteArray,
+ restriction: Range2d?
+): ByteArray {
+ val baseElement = renderScriptElementForBitmap(context, bitmap)
+ val scriptLut: ScriptIntrinsicLUT = ScriptIntrinsicLUT.create(context, baseElement)
+ val inputAllocation = Allocation.createFromBitmap(context, bitmap)
+ inputAllocation.copyFrom(bitmap)
+ val outAllocation = Allocation.createTyped(context, inputAllocation.type)
+ val intrinsicOutArray = ByteArray(bitmap.byteCount)
+
+ for (v in 0..255) {
+ scriptLut.setRed(v, newRed[v].toUByte().toInt())
+ scriptLut.setGreen(v, newGreen[v].toUByte().toInt())
+ scriptLut.setBlue(v, newBlue[v].toUByte().toInt())
+ scriptLut.setAlpha(v, newAlpha[v].toUByte().toInt())
+ }
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptLut.forEach(inputAllocation, outAllocation, options)
+ } else {
+ scriptLut.forEach(inputAllocation, outAllocation)
+ }
+
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ scriptLut.destroy()
+ return intrinsicOutArray
+}
diff --git a/toolkit/test/IntrinsicLut3d.kt b/toolkit/test/IntrinsicLut3d.kt
new file mode 100644
index 0000000..48e785e
--- /dev/null
+++ b/toolkit/test/IntrinsicLut3d.kt
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsic3DLUT
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a 3D LookUpTable operation using the RenderScript Intrinsics.
+ */
+fun intrinsicLut3d(
+ context: RenderScript,
+ inputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ cubeArray: ByteArray,
+ cubeSize: Dimension,
+ restriction: Range2d?
+): ByteArray {
+ val scriptLut3d: ScriptIntrinsic3DLUT = ScriptIntrinsic3DLUT.create(
+ context, Element.U8_4(
+ context
+ )
+ )
+ val builder = Type.Builder(context, Element.U8_4(context))
+ builder.setX(sizeX)
+ builder.setY(sizeY)
+ val arrayType = builder.create()
+ val inputAllocation = Allocation.createTyped(context, arrayType)
+ val outAllocation = Allocation.createTyped(context, arrayType)
+ inputAllocation.copyFrom(inputArray)
+ val intrinsicOutArray = ByteArray(sizeX * sizeY * 4)
+
+ val cubeTypeBuilder: Type.Builder =
+ Type.Builder(context, Element.U8_4(context))
+ cubeTypeBuilder.setX(cubeSize.sizeX)
+ cubeTypeBuilder.setY(cubeSize.sizeY)
+ cubeTypeBuilder.setZ(cubeSize.sizeZ)
+ val cubeType: Type = cubeTypeBuilder.create()
+ val cubeAllocation = Allocation.createTyped(context, cubeType)
+ cubeAllocation.copyFrom(cubeArray)
+ scriptLut3d.setLUT(cubeAllocation)
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptLut3d.forEach(inputAllocation, outAllocation, options)
+ } else {
+ scriptLut3d.forEach(inputAllocation, outAllocation)
+ }
+
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ cubeAllocation.destroy()
+ arrayType.destroy()
+ cubeType.destroy()
+ scriptLut3d.destroy()
+ return intrinsicOutArray
+}
+
+fun intrinsicLut3d(
+ context: RenderScript,
+ bitmap: Bitmap,
+ cubeArray: ByteArray,
+ cubeSize: Dimension,
+ restriction: Range2d?
+): ByteArray {
+ val baseElement = renderScriptElementForBitmap(context, bitmap)
+ val scriptLut3d: ScriptIntrinsic3DLUT = ScriptIntrinsic3DLUT.create(context, baseElement)
+ val inputAllocation = Allocation.createFromBitmap(context, bitmap)
+ inputAllocation.copyFrom(bitmap)
+ val outAllocation = Allocation.createTyped(context, inputAllocation.type)
+ val intrinsicOutArray = ByteArray(bitmap.byteCount)
+
+ val cubeTypeBuilder: Type.Builder =
+ Type.Builder(context, Element.U8_4(context))
+ cubeTypeBuilder.setX(cubeSize.sizeX)
+ cubeTypeBuilder.setY(cubeSize.sizeY)
+ cubeTypeBuilder.setZ(cubeSize.sizeZ)
+ val cubeType: Type = cubeTypeBuilder.create()
+ val cubeAllocation = Allocation.createTyped(context, cubeType)
+ cubeAllocation.copyFrom(cubeArray)
+ scriptLut3d.setLUT(cubeAllocation)
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptLut3d.forEach(inputAllocation, outAllocation, options)
+ } else {
+ scriptLut3d.forEach(inputAllocation, outAllocation)
+ }
+
+ outAllocation.copyTo(intrinsicOutArray)
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ cubeAllocation.destroy()
+ cubeType.destroy()
+ scriptLut3d.destroy()
+ return intrinsicOutArray
+}
diff --git a/toolkit/test/IntrinsicResize.kt b/toolkit/test/IntrinsicResize.kt
new file mode 100644
index 0000000..5cdf89a
--- /dev/null
+++ b/toolkit/test/IntrinsicResize.kt
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicResize
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a Resize operation using the RenderScript Intrinsics.
+ */
+fun intrinsicResize(
+ context: RenderScript,
+ inputArray: ByteArray,
+ vectorSize: Int,
+ inSizeX: Int,
+ inSizeY: Int,
+ outSizeX: Int,
+ outSizeY: Int,
+ restriction: Range2d?
+): ByteArray {
+ val scriptResize = ScriptIntrinsicResize.create(context)
+ val builder = Type.Builder(
+ context,
+ renderScriptVectorElementForU8(context, vectorSize)
+ )
+ builder.setX(inSizeX)
+ builder.setY(inSizeY)
+ val inputArrayType = builder.create()
+ val inputAllocation = Allocation.createTyped(context, inputArrayType)
+ builder.setX(outSizeX)
+ builder.setY(outSizeY)
+ val outputArrayType = builder.create()
+ val outAllocation = Allocation.createTyped(context, outputArrayType)
+ val intrinsicOutArray = ByteArray(outSizeX * outSizeY * paddedSize(vectorSize))
+
+ inputAllocation.copyFrom(inputArray)
+ scriptResize.setInput(inputAllocation)
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptResize.forEach_bicubic(outAllocation, options)
+ } else {
+ scriptResize.forEach_bicubic(outAllocation)
+ }
+ outAllocation.copyTo(intrinsicOutArray)
+
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ scriptResize.destroy()
+ inputArrayType.destroy()
+ outputArrayType.destroy()
+ return intrinsicOutArray
+}
+
+fun intrinsicResize(
+ context: RenderScript,
+ bitmap: Bitmap,
+ outSizeX: Int,
+ outSizeY: Int,
+ restriction: Range2d?
+): ByteArray {
+ val scriptResize = ScriptIntrinsicResize.create(context)
+ val inputAllocation = Allocation.createFromBitmap(context, bitmap)
+ inputAllocation.copyFrom(bitmap)
+
+ val vectorSize = when (bitmap.config) {
+ Bitmap.Config.ARGB_8888 -> 4
+ Bitmap.Config.ALPHA_8 -> 1
+ else -> error("Unrecognized bitmap config $bitmap.config")
+ }
+ val builder = Type.Builder(
+ context,
+ renderScriptVectorElementForU8(context, vectorSize)
+ )
+ builder.setX(outSizeX)
+ builder.setY(outSizeY)
+ val outputArrayType = builder.create()
+ val outAllocation = Allocation.createTyped(context, outputArrayType)
+ val intrinsicOutArray = ByteArray(outSizeX * outSizeY * vectorSize)
+
+ scriptResize.setInput(inputAllocation)
+ if (restriction != null) {
+ outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+ val options = Script.LaunchOptions()
+ options.setX(restriction.startX, restriction.endX)
+ options.setY(restriction.startY, restriction.endY)
+ scriptResize.forEach_bicubic(outAllocation, options)
+ } else {
+ scriptResize.forEach_bicubic(outAllocation)
+ }
+ outAllocation.copyTo(intrinsicOutArray)
+
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ outputArrayType.destroy()
+ scriptResize.destroy()
+ return intrinsicOutArray
+}
diff --git a/toolkit/test/IntrinsicYuvToRgb.kt b/toolkit/test/IntrinsicYuvToRgb.kt
new file mode 100644
index 0000000..5e46f2e
--- /dev/null
+++ b/toolkit/test/IntrinsicYuvToRgb.kt
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.ImageFormat
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.ScriptIntrinsicYuvToRGB
+import android.renderscript.Type
+import android.renderscript.toolkit.YuvFormat
+
+/**
+ * Does a YUV to RGB operation using the RenderScript Intrinsics.
+ */
+fun intrinsicYuvToRgb(
+ context: RenderScript,
+ inputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ format: YuvFormat
+): ByteArray {
+ val scriptYuvToRgb = ScriptIntrinsicYuvToRGB.create(
+ context,
+ Element.YUV(context)
+ )
+ val inputBuilder = Type.Builder(context, Element.YUV(context))
+ inputBuilder.setX(sizeX)
+ inputBuilder.setY(sizeY)
+ when (format) {
+ YuvFormat.NV21 -> inputBuilder.setYuvFormat(ImageFormat.NV21)
+ YuvFormat.YV12 -> inputBuilder.setYuvFormat(ImageFormat.YV12)
+ else -> require(false) { "Unknown YUV format $format" }
+ }
+ val inputArrayType = inputBuilder.create()
+ val inputAllocation = Allocation.createTyped(context, inputArrayType)
+
+ val outputBuilder = Type.Builder(context, Element.U8_4(context))
+ outputBuilder.setX(sizeX)
+ outputBuilder.setY(sizeY)
+ val outputArrayType = outputBuilder.create()
+ val outAllocation = Allocation.createTyped(context, outputArrayType)
+ val intrinsicOutArray = ByteArray(sizeX * sizeY * 4)
+
+ inputAllocation.copyFrom(inputArray)
+ scriptYuvToRgb.setInput(inputAllocation)
+ scriptYuvToRgb.forEach(outAllocation)
+ outAllocation.copyTo(intrinsicOutArray)
+
+ inputAllocation.destroy()
+ outAllocation.destroy()
+ inputArrayType.destroy()
+ outputArrayType.destroy()
+ scriptYuvToRgb.destroy()
+ return intrinsicOutArray
+}
diff --git a/toolkit/test/MainActivity.kt b/toolkit/test/MainActivity.kt
new file mode 100644
index 0000000..4092861
--- /dev/null
+++ b/toolkit/test/MainActivity.kt
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.os.Bundle
+import android.widget.TextView
+import androidx.appcompat.app.AppCompatActivity
+
+@ExperimentalUnsignedTypes
+class MainActivity : AppCompatActivity() {
+
+ override fun onCreate(savedInstanceState: Bundle?) {
+ super.onCreate(savedInstanceState)
+ setContentView(R.layout.activity_main)
+
+ // Enable CloseGuard to help debug resources that are not destroyed;
+ // leaked resources will log "A resource failed to call destroy."
+ try {
+ Class.forName("dalvik.system.CloseGuard")
+ .getMethod("setEnabled", Boolean::class.javaPrimitiveType)
+ .invoke(null, true)
+ } catch (e: ReflectiveOperationException) {
+ throw RuntimeException(e)
+ }
+
+ val validate = true
+ val tester = Tester(this, validate)
+ val numberOfIterations = if (validate) 1 else 28
+ val t = TimingTracker(numberOfIterations, 0)
+ for (i in 1..numberOfIterations) {
+ println("*** Iteration $i of $numberOfIterations ****")
+ //startMethodTracing("myTracing")
+ //startMethodTracingSampling("myTracing_sample", 8000000, 10)
+ val r = tester.testAll(t)
+ //stopMethodTracing()
+ findViewById<TextView>(R.id.sample_text).text = "$r\n\n${t.report()}"
+ t.nextIteration()
+ }
+ tester.destroy()
+ }
+}
diff --git a/toolkit/test/ReferenceBlend.kt b/toolkit/test/ReferenceBlend.kt
new file mode 100644
index 0000000..ba60bc8
--- /dev/null
+++ b/toolkit/test/ReferenceBlend.kt
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.BlendingMode
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a Blend operation.
+ *
+ * See the class Rgba for details of the arithmetic operations on that class.
+ */
+@ExperimentalUnsignedTypes
+fun referenceBlend(
+ mode: BlendingMode,
+ sourceArray: ByteArray,
+ destArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d?
+) {
+ val source = Rgba2dArray(sourceArray, sizeX, sizeY)
+ val dest = Rgba2dArray(destArray, sizeX, sizeY)
+
+ /**
+ * For each corresponding RGBA value of the source and destination arrays, invoke the blend
+ * function and store the result in the destination array.
+ */
+ fun blendEachPair(blendFunction: (src: Rgba, dst: Rgba) -> Rgba) {
+ dest.forEachCell(restriction) { x, y ->
+ dest[x, y] = blendFunction(source[x, y], dest[x, y])
+ }
+ }
+
+ when (mode) {
+ BlendingMode.CLEAR -> blendEachPair { _, _ -> Rgba(0, 0, 0, 0) }
+ BlendingMode.SRC -> blendEachPair { src, _ -> src }
+ BlendingMode.DST -> { /* This doesn't do anything. */ }
+ BlendingMode.SRC_OVER -> blendEachPair { src, dst -> blendOver(src, dst) }
+ BlendingMode.DST_OVER -> blendEachPair { src, dst -> blendOver(dst, src) }
+ BlendingMode.SRC_IN -> blendEachPair { src, dst -> blendIn(src, dst) }
+ BlendingMode.DST_IN -> blendEachPair { src, dst -> blendIn(dst, src) }
+ BlendingMode.SRC_OUT -> blendEachPair { src, dst -> blendOut(src, dst) }
+ BlendingMode.DST_OUT -> blendEachPair { src, dst -> blendOut(dst, src) }
+ BlendingMode.SRC_ATOP -> blendEachPair { src, dst -> blendAtop(src, dst) }
+ BlendingMode.DST_ATOP -> blendEachPair { src, dst -> blendAtop(dst, src) }
+ BlendingMode.XOR -> blendEachPair { src, dst -> src xor dst }
+ BlendingMode.MULTIPLY -> blendEachPair { src, dst -> src * dst }
+ BlendingMode.ADD -> blendEachPair { src, dst -> dst + src }
+ BlendingMode.SUBTRACT -> blendEachPair { src, dst -> dst - src }
+ }
+}
+
+@ExperimentalUnsignedTypes
+private fun blendOver(src: Rgba, dst: Rgba) = src + (dst * (255 - src.a))
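+// Informal note: with a fully opaque source (src.a == 255), the dst * (255 - src.a) term is a
+// multiply by zero, so blendOver returns src unchanged (assuming Rgba's scaled multiply maps a
+// zero factor to zero).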
+
+@ExperimentalUnsignedTypes
+private fun blendIn(src: Rgba, dst: Rgba) = src * dst.a
+
+@ExperimentalUnsignedTypes
+private fun blendOut(src: Rgba, dst: Rgba) = src * (255 - dst.a)
+
+@ExperimentalUnsignedTypes
+private fun blendAtop(src: Rgba, dst: Rgba): Rgba {
+ val value = src * dst.a + dst * (255 - src.a)
+ value.a = dst.a
+ return value
+}
diff --git a/toolkit/test/ReferenceBlur.kt b/toolkit/test/ReferenceBlur.kt
new file mode 100644
index 0000000..66c2a05
--- /dev/null
+++ b/toolkit/test/ReferenceBlur.kt
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+import kotlin.math.max
+import kotlin.math.min
+import kotlin.math.pow
+import kotlin.math.sqrt
+
+/**
+ * Reference implementation of a Blur operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceBlur(inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ radius: Int = 5, restriction: Range2d?): ByteArray {
+ val maxRadius = 25
+ require (radius in 1..maxRadius) {
+ "RenderScriptToolkit blur. Radius should be between 1 and $maxRadius. $radius provided."
+ }
+ val gaussian = buildGaussian(radius)
+
+ // Convert input data to float so that the blurring goes faster.
+ val inputValues = FloatArray(inputArray.size) { byteToUnitFloat(inputArray[it].toUByte()) }
+ val inputInFloat = FloatVector2dArray(inputValues, vectorSize, sizeX, sizeY)
+
+ val scratch = horizontalBlur(inputInFloat, gaussian, radius, restriction)
+ val outInFloat = verticalBlur(scratch, gaussian, radius, restriction)
+
+ // Convert the results back to bytes.
+ return ByteArray(outInFloat.values.size) { unitFloatClampedToUByte(outInFloat.values[it]).toByte() }
+}
+
+/**
+ * Blurs along the horizontal direction using the specified gaussian weights.
+ */
+private fun horizontalBlur(
+ input: FloatVector2dArray,
+ gaussian: FloatArray,
+ radius: Int,
+ restriction: Range2d?
+): FloatVector2dArray {
+ var expandedRestriction: Range2d? = null
+ if (restriction != null) {
+ // Expand the restriction in the vertical direction so that the vertical pass
+ // will have all the data it needs.
+ expandedRestriction = Range2d(
+ restriction.startX,
+ restriction.endX,
+ max(restriction.startY - radius, 0),
+ min(restriction.endY + radius, input.sizeY)
+ )
+ }
+
+ input.clipAccessToRange = true
+ val out = input.createSameSized()
+ out.forEach(expandedRestriction) { x, y ->
+ for ((gaussianIndex, delta: Int) in (-radius..radius).withIndex()) {
+ val v = input[x + delta, y] * gaussian[gaussianIndex]
+ out[x, y] += v
+ }
+ }
+ return out
+}
+
+/**
+ * Blurs along the vertical direction using the specified gaussian weights.
+ */
+private fun verticalBlur(
+ input: FloatVector2dArray,
+ gaussian: FloatArray,
+ radius: Int,
+ restriction: Range2d?
+): FloatVector2dArray {
+ input.clipAccessToRange = true
+ val out = input.createSameSized()
+ out.forEach(restriction) { x, y ->
+ for ((gaussianIndex, delta: Int) in (-radius..radius).withIndex()) {
+ val v = input[x, y + delta] * gaussian[gaussianIndex]
+ out[x, y] += v
+ }
+ }
+ return out
+}
+
+/**
+ * Builds an array of gaussian weights that will be used for doing the horizontal and vertical
+ * blur.
+ *
+ * @return An array of (2 * radius + 1) floats.
+ */
+private fun buildGaussian(radius: Int): FloatArray {
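+ // Informal sanity check of the formula below: with radius = 1, sigma = 0.4f * 1 + 0.6f = 1.0f,
+ // the unnormalized weights are roughly [0.242, 0.399, 0.242]; after normalization they become
+ // approximately [0.274, 0.452, 0.274] and sum to 1.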
+ val e: Float = kotlin.math.E.toFloat()
+ val pi: Float = kotlin.math.PI.toFloat()
+ val sigma: Float = 0.4f * radius.toFloat() + 0.6f
+ val coefficient1: Float = 1.0f / (sqrt(2.0f * pi) * sigma)
+ val coefficient2: Float = -1.0f / (2.0f * sigma * sigma)
+
+ var sum = 0.0f
+ val gaussian = FloatArray(radius * 2 + 1)
+ for (r in -radius..radius) {
+ val floatR: Float = r.toFloat()
+ val v: Float = coefficient1 * e.pow(floatR * floatR * coefficient2)
+ gaussian[r + radius] = v
+ sum += v
+ }
+
+ // Normalize so that the sum of the weights equals 1f.
+ val normalizeFactor: Float = 1.0f / sum
+ for (r in -radius..radius) {
+ gaussian[r + radius] *= normalizeFactor
+ }
+ return gaussian
+}
diff --git a/toolkit/test/ReferenceColorMatrix.kt b/toolkit/test/ReferenceColorMatrix.kt
new file mode 100644
index 0000000..75f93af
--- /dev/null
+++ b/toolkit/test/ReferenceColorMatrix.kt
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a ColorMatrix operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceColorMatrix(inputArray: ByteArray,
+ inputVectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ outputVectorSize: Int,
+ matrix: FloatArray, addVector: FloatArray,
+ restriction: Range2d?): ByteArray {
+ require (matrix.size == 16) { "RenderScriptToolkit colorMatrix. Matrix should have 16 values. ${matrix.size} provided." }
+
+ val input = Vector2dArray(inputArray.asUByteArray(), inputVectorSize, sizeX, sizeY)
+ val outputArray = ByteArray(sizeX * sizeY * paddedSize(outputVectorSize))
+ val output = Vector2dArray(outputArray.asUByteArray(), outputVectorSize, sizeX, sizeY)
+
+ output.forEach (restriction) { x, y ->
+ val inUByteValue = input[x, y]
+ val inFloatValue = FloatArray(4) { if (it >= inputVectorSize) 0f else byteToUnitFloat(inUByteValue[it]) }
+ val outFloatValue = multiplyAndAdd(matrix, inFloatValue, addVector)
+ val outUByteValue = UByteArray(paddedSize(output.vectorSize)) { unitFloatClampedToUByte(outFloatValue[it]) }
+ output[x, y] = outUByteValue
+ }
+ return outputArray
+}
+
+private fun multiplyAndAdd(matrix: FloatArray, inVector: FloatArray, addVector: FloatArray): FloatArray {
+ // In RenderScript, the matrix is specified in column-major format.
+ val result = addVector.clone()
+ for (i in 0..3) {
+ for (j in 0..3) {
+ result[i] += matrix[j * 4 + i] * inVector[j]
+ }
+ }
+ return result
+}
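+
+// For example, passing the column-major identity matrix floatArrayOf(1f, 0f, 0f, 0f, 0f, 1f, 0f, 0f,
+// 0f, 0f, 1f, 0f, 0f, 0f, 0f, 1f) with a zero addVector leaves each pixel essentially unchanged,
+// up to the byteToUnitFloat / unitFloatClampedToUByte round trip.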
diff --git a/toolkit/test/ReferenceConvolve.kt b/toolkit/test/ReferenceConvolve.kt
new file mode 100644
index 0000000..b9181a9
--- /dev/null
+++ b/toolkit/test/ReferenceConvolve.kt
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a Convolve operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceConvolve(
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ coefficients: FloatArray,
+ restriction: Range2d?
+): ByteArray {
+ val input = Vector2dArray(inputArray.asUByteArray(), vectorSize, sizeX, sizeY)
+ val radius = when (coefficients.size) {
+ 9 -> 1
+ 25 -> 2
+ else -> {
+ throw IllegalArgumentException("RenderScriptToolkit Convolve. Only 3x3 and 5x5 convolutions are supported. ${coefficients.size} coefficients provided.")
+ }
+ }
+
+ input.clipReadToRange = true
+ val output = input.createSameSized()
+ input.forEach(restriction) { x, y ->
+ output[x, y] = convolveOne(input, x, y, coefficients, radius)
+ }
+ return output.values.asByteArray()
+}
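+// For instance, coefficients = FloatArray(9) { 1f / 9f } is a simple 3x3 box blur, while
+// floatArrayOf(0f, 0f, 0f, 0f, 1f, 0f, 0f, 0f, 0f) is the identity kernel and leaves the
+// image unchanged.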
+
+@ExperimentalUnsignedTypes
+private fun convolveOne(
+ inputAlloc: Vector2dArray,
+ x: Int,
+ y: Int,
+ coefficients: FloatArray,
+ radius: Int
+): UByteArray {
+ var sum = FloatArray(paddedSize(inputAlloc.vectorSize))
+ var coefficientIndex = 0
+ for (deltaY in -radius..radius) {
+ for (deltaX in -radius..radius) {
+ val inputVector = inputAlloc[x + deltaX, y + deltaY]
+ sum += inputVector.toFloatArray() * coefficients[coefficientIndex]
+ coefficientIndex++
+ }
+ }
+ return sum.clampToUByte()
+}
diff --git a/toolkit/test/ReferenceHistogram.kt b/toolkit/test/ReferenceHistogram.kt
new file mode 100644
index 0000000..6bd9167
--- /dev/null
+++ b/toolkit/test/ReferenceHistogram.kt
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a Histogram operation.
+ *
+ * Return an array of 4 * 256 ints.
+ * Position 0 is the number of R with a value of 0,
+ * Position 1 is the number of G with a value of 0,
+ * Position 2 is the number of B with a value of 0,
+ * Position 3 is the number of A with a value of 0,
+ * Position 4 is the number of R with a value of 1,
+ * etc.
+*/
+@ExperimentalUnsignedTypes
+fun referenceHistogram(
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ restriction: Range2d?
+): IntArray {
+ val input = Vector2dArray(inputArray.asUByteArray(), vectorSize, sizeX, sizeY)
+
+ val counts = IntArray(paddedSize(input.vectorSize) * 256)
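+ // For example, if every pixel of an unrestricted 4-channel image is (R=1, G=2, B=3, A=255), then
+ // counts[1 * 4 + 0], counts[2 * 4 + 1], counts[3 * 4 + 2] and counts[255 * 4 + 3] each end up
+ // equal to sizeX * sizeY, and every other entry stays zero.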
+ input.forEach(restriction) { x, y ->
+ val value = input[x, y]
+ for (i in 0 until vectorSize) {
+ counts[value[i].toInt() * paddedSize(input.vectorSize) + i]++
+ }
+ }
+ return counts
+}
+
+/**
+ * Reference implementation of a HistogramDot operation.
+ *
+ * Each RGBA input value is dot-multiplied first by the specified coefficients.
+ * The resulting value is converted to an integer and used for the histogram.
+ */
+@ExperimentalUnsignedTypes
+fun referenceHistogramDot(
+ inputArray: ByteArray,
+ vectorSize: Int,
+ sizeX: Int,
+ sizeY: Int,
+ coefficients: FloatArray?,
+ restriction: Range2d?
+): IntArray {
+ val floatCoefficients = coefficients ?: floatArrayOf(0.299f, 0.587f, 0.114f, 0f)
+ val input = Vector2dArray(inputArray.asUByteArray(), vectorSize, sizeX, sizeY)
+ var coefficientSum = 0f
+ for (c in floatCoefficients) {
+ require (c >= 0) {
+ "RenderScriptToolkit histogramDot. Coefficients must be positive. $c provided."
+ }
+ coefficientSum += c
+ }
+ require(coefficientSum <= 1f) { "RenderScriptToolkit histogramDot. Coefficients should " +
+ "add to 1.0 or less. $coefficientSum provided." }
+
+ // Convert the coefficients to fixed point, scaled by 256 and rounded to the nearest integer.
+ val intCoefficients = IntArray(input.vectorSize) { (floatCoefficients[it] * 256f + 0.5f).toInt() }
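+ // Informal example: the default red coefficient 0.299 becomes (0.299f * 256f + 0.5f).toInt() = 77,
+ // so a pure-red pixel (255, 0, 0, 255) contributes 77 * 255 = 19635 and lands in bucket
+ // (19635 + 0x7f) shr 8 = 77.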
+
+ val counts = IntArray(256)
+ input.forEach(restriction) { x, y ->
+ val value = input[x, y]
+ // While we could do the computation using floats, we won't get the same results as
+ // the existing intrinsics.
+ var sum = 0
+ // We don't use value.indices because we want to accumulate only 3 values, in the case
+ // of vectorSize == 3.
+ for (i in 0 until vectorSize) {
+ sum += intCoefficients[i] * value[i].toInt()
+ }
+ // Round (add 0x7f) and scale the sum back down to a 0..255 bucket index.
+ val index = (sum + 0x7f) shr 8
+ counts[index]++
+ }
+ return counts
+}
diff --git a/toolkit/test/ReferenceLut.kt b/toolkit/test/ReferenceLut.kt
new file mode 100644
index 0000000..cd832f0
--- /dev/null
+++ b/toolkit/test/ReferenceLut.kt
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.LookupTable
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a LookUpTable operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceLut(
+ inputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ table: LookupTable,
+ restriction: Range2d?
+): ByteArray {
+ val input = Vector2dArray(inputArray.asUByteArray(), 4, sizeX, sizeY)
+
+ val output = input.createSameSized()
+ input.forEach(restriction) { x, y ->
+ val oldValue = input[x, y]
+ val newValue = byteArrayOf(
+ table.red[oldValue[0].toInt()],
+ table.green[oldValue[1].toInt()],
+ table.blue[oldValue[2].toInt()],
+ table.alpha[oldValue[3].toInt()]
+ )
+ output[x, y] = newValue.asUByteArray()
+ }
+ return output.values.asByteArray()
+}
+
diff --git a/toolkit/test/ReferenceLut3d.kt b/toolkit/test/ReferenceLut3d.kt
new file mode 100644
index 0000000..afd977b
--- /dev/null
+++ b/toolkit/test/ReferenceLut3d.kt
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+import android.renderscript.toolkit.Rgba3dArray
+
+/**
+ * Reference implementation of a 3D LookUpTable operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceLut3d(
+ inputArray: ByteArray,
+ sizeX: Int,
+ sizeY: Int,
+ cube: Rgba3dArray,
+ restriction: Range2d?
+): ByteArray {
+ val input = Vector2dArray(inputArray.asUByteArray(), 4, sizeX, sizeY)
+ val output = input.createSameSized()
+ input.forEach(restriction) { x, y ->
+ output[x, y] = lookup(input[x, y], cube)
+ }
+ return output.values.asByteArray()
+}
+
+@ExperimentalUnsignedTypes
+private fun lookup(input: UByteArray, cube: Rgba3dArray): UByteArray {
+ // Calculate the two lattice points at opposite corners of the unit cell
+ // that contains our point.
+ val maxIndex = Int4(cube.sizeX - 1, cube.sizeY - 1, cube.sizeZ - 1, 0)
+ val baseCoordinate: Float4 = input.toFloat4() * maxIndex.toFloat4() / 255f
+ val point1: Int4 = baseCoordinate.intFloor()
+ val point2: Int4 = min(point1 + 1, maxIndex)
+ val fractionAwayFromPoint1: Float4 = baseCoordinate - point1.toFloat4()
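+ // For instance, with a 17x17x17 cube, a red value of 128 gives baseCoordinate.x =
+ // 128 * 16 / 255 ~= 8.03, so point1.x = 8, point2.x = 9 and fractionAwayFromPoint1.x ~= 0.03.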
+
+ // Get the RGBA values at each of the eight corners of the unit cell.
+ val v000 = cube[point1.x, point1.y, point1.z].toFloat4()
+ val v100 = cube[point2.x, point1.y, point1.z].toFloat4()
+ val v010 = cube[point1.x, point2.y, point1.z].toFloat4()
+ val v110 = cube[point2.x, point2.y, point1.z].toFloat4()
+ val v001 = cube[point1.x, point1.y, point2.z].toFloat4()
+ val v101 = cube[point2.x, point1.y, point2.z].toFloat4()
+ val v011 = cube[point1.x, point2.y, point2.z].toFloat4()
+ val v111 = cube[point2.x, point2.y, point2.z].toFloat4()
+
+ // Do the linear mixing of these eight values.
+ val yz00 = mix(v000, v100, fractionAwayFromPoint1.x)
+ val yz10 = mix(v010, v110, fractionAwayFromPoint1.x)
+ val yz01 = mix(v001, v101, fractionAwayFromPoint1.x)
+ val yz11 = mix(v011, v111, fractionAwayFromPoint1.x)
+
+ val z0 = mix(yz00, yz10, fractionAwayFromPoint1.y)
+ val z1 = mix(yz01, yz11, fractionAwayFromPoint1.y)
+
+ val v = mix(z0, z1, fractionAwayFromPoint1.z)
+
+ // Preserve the alpha of the original value
+ return ubyteArrayOf(v.x.clampToUByte(), v.y.clampToUByte(), v.z.clampToUByte(), input[3])
+}
diff --git a/toolkit/test/ReferenceResize.kt b/toolkit/test/ReferenceResize.kt
new file mode 100644
index 0000000..023825e
--- /dev/null
+++ b/toolkit/test/ReferenceResize.kt
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+import kotlin.math.floor
+import kotlin.math.max
+
+var trace = false
+
+/**
+ * Reference implementation of a Resize operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceResize(inputArray: ByteArray,
+ vectorSize: Int,
+ inSizeX: Int,
+ inSizeY: Int,
+ outSizeX: Int, outSizeY: Int,
+ restriction: Range2d?): ByteArray {
+ val input = Vector2dArray(inputArray.asUByteArray(), vectorSize, inSizeX, inSizeY)
+ val scaleX: Float = input.sizeX.toFloat() / outSizeX.toFloat()
+ val scaleY: Float = input.sizeY.toFloat() / outSizeY.toFloat()
+ val outArray = UByteArray(outSizeX * outSizeY * paddedSize(input.vectorSize))
+ val out = Vector2dArray(outArray, input.vectorSize, outSizeX, outSizeY)
+ out.forEach (restriction) { x, y ->
+ if (x == 1827 && y == 46) {
+ println("Found it")
+ trace = true
+ }
+ val o = bicubicU4(x, y, input, scaleX, scaleY)
+ out[x, y] = o.clampToUByte()
+ }
+ return out.values.asByteArray()
+}
+
+private fun cubicInterpolateF(p0: FloatArray, p1: FloatArray, p2: FloatArray, p3: FloatArray,
+ x: Float): FloatArray {
+ return p1 + (p2 - p0 + (p0 * 2f - p1 * 5f + p2 * 4f - p3
+ + ((p1 - p2) * 3f + p3 - p0) * x) * x) * x * 0.5f
+}
+
+@ExperimentalUnsignedTypes
+private fun bicubicU4(x: Int, y: Int, gIn: Vector2dArray, scaleX: Float, scaleY: Float): FloatArray {
+ var xf: Float = (x + 0.5f) * scaleX - 0.5f
+ var yf: Float = (y + 0.5f) * scaleY - 0.5f
+
+ val startX: Int = floor(xf - 1).toInt()
+ val startY: Int = floor(yf - 1).toInt()
+ xf -= floor(xf)
+ yf -= floor(yf)
+ val maxX: Int = gIn.sizeX - 1
+ val maxY: Int = gIn.sizeY - 1
+
+ val xs0: Int = max(0, startX + 0)
+ val xs1: Int = max(0, startX + 1)
+ val xs2: Int = kotlin.math.min(maxX, startX + 2)
+ val xs3: Int = kotlin.math.min(maxX, startX + 3)
+
+ val ys0: Int = max(0, startY + 0)
+ val ys1: Int = max(0, startY + 1)
+ val ys2: Int = kotlin.math.min(maxY, startY + 2)
+ val ys3: Int = kotlin.math.min(maxY, startY + 3)
+
+ val p00 = gIn[xs0, ys0].toFloatArray()
+ val p01 = gIn[xs1, ys0].toFloatArray()
+ val p02 = gIn[xs2, ys0].toFloatArray()
+ val p03 = gIn[xs3, ys0].toFloatArray()
+ val p0 = cubicInterpolateF(p00, p01, p02, p03, xf)
+
+ val p10 = gIn[xs0, ys1].toFloatArray()
+ val p11 = gIn[xs1, ys1].toFloatArray()
+ val p12 = gIn[xs2, ys1].toFloatArray()
+ val p13 = gIn[xs3, ys1].toFloatArray()
+ val p1 = cubicInterpolateF(p10, p11, p12, p13, xf)
+
+ val p20 = gIn[xs0, ys2].toFloatArray()
+ val p21 = gIn[xs1, ys2].toFloatArray()
+ val p22 = gIn[xs2, ys2].toFloatArray()
+ val p23 = gIn[xs3, ys2].toFloatArray()
+ val p2 = cubicInterpolateF(p20, p21, p22, p23, xf)
+
+ val p30 = gIn[xs0, ys3].toFloatArray()
+ val p31 = gIn[xs1, ys3].toFloatArray()
+ val p32 = gIn[xs2, ys3].toFloatArray()
+ val p33 = gIn[xs3, ys3].toFloatArray()
+ val p3 = cubicInterpolateF(p30, p31, p32, p33, xf)
+
+ return cubicInterpolateF(p0, p1, p2, p3, yf)
+}
+
+
+/* To be used if we implement Floats
+private fun bicubic_F4(x: Int, y: Int, gin: ByteArray, sizeX: Int, sizeY: Int, scaleX: Float, scaleY: Float): Float4 {
+ var xf: Float = (x + 0.5f) * scaleX - 0.5f
+ var yf: Float = (y + 0.5f) * scaleY - 0.5f
+
+ val startX: Int = floor(xf - 1).toInt()
+ val startY: Int = floor(yf - 1).toInt()
+ xf = xf - floor(xf)
+ yf = yf - floor(yf)
+ val maxX: Int = sizeX - 1
+ val maxY: Int = sizeY - 1
+
+ val xs0: Int = max(0, startX + 0)
+ val xs1: Int = max(0, startX + 1)
+ val xs2: Int = min(maxX, startX + 2)
+ val xs3: Int = min(maxX, startX + 3)
+
+ val ys0: Int = max(0, startY + 0)
+ val ys1: Int = max(0, startY + 1)
+ val ys2: Int = min(maxY, startY + 2)
+ val ys3: Int = min(maxY, startY + 3)
+
+ val p00: Float4 = rsGetElementAt_Float4(gIn, xs0, ys0)
+ val p01: Float4 = rsGetElementAt_Float4(gIn, xs1, ys0)
+ val p02: Float4 = rsGetElementAt_Float4(gIn, xs2, ys0)
+ val p03: Float4 = rsGetElementAt_Float4(gIn, xs3, ys0)
+ val p0: Float4 = cubicInterpolate_F4(p00, p01, p02, p03, xf)
+
+ val p10: Float4 = rsGetElementAt_Float4(gIn, xs0, ys1)
+ val p11: Float4 = rsGetElementAt_Float4(gIn, xs1, ys1)
+ val p12: Float4 = rsGetElementAt_Float4(gIn, xs2, ys1)
+ val p13: Float4 = rsGetElementAt_Float4(gIn, xs3, ys1)
+ val p1: Float4 = cubicInterpolate_F4(p10, p11, p12, p13, xf)
+
+ val p20: Float4 = rsGetElementAt_Float4(gIn, xs0, ys2)
+ val p21: Float4 = rsGetElementAt_Float4(gIn, xs1, ys2)
+ val p22: Float4 = rsGetElementAt_Float4(gIn, xs2, ys2)
+ val p23: Float4 = rsGetElementAt_Float4(gIn, xs3, ys2)
+ val p2: Float4 = cubicInterpolate_F4(p20, p21, p22, p23, xf)
+
+ val p30: Float4 = rsGetElementAt_Float4(gIn, xs0, ys3)
+ val p31: Float4 = rsGetElementAt_Float4(gIn, xs1, ys3)
+ val p32: Float4 = rsGetElementAt_Float4(gIn, xs2, ys3)
+ val p33: Float4 = rsGetElementAt_Float4(gIn, xs3, ys3)
+ val p3: Float4 = cubicInterpolate_F4(p30, p31, p32, p33, xf)
+
+ val p: Float4 = cubicInterpolate_F4(p0, p1, p2, p3, yf)
+
+ return p
+}
+*/
diff --git a/toolkit/test/ReferenceYuvToRgb.kt b/toolkit/test/ReferenceYuvToRgb.kt
new file mode 100644
index 0000000..4d91cf6
--- /dev/null
+++ b/toolkit/test/ReferenceYuvToRgb.kt
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.YuvFormat
+import java.lang.IllegalArgumentException
+
+/**
+ * Reference implementation of a YUV to RGB operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceYuvToRgb(inputSignedArray: ByteArray, sizeX: Int, sizeY: Int, format: YuvFormat): ByteArray {
+ require(sizeX % 2 == 0) { "The width of the input should be even."}
+ val inputArray = inputSignedArray.asUByteArray()
+
+ val outputArray = ByteArray(sizeX * sizeY * 4)
+ val output = Vector2dArray(outputArray.asUByteArray(), 4, sizeX, sizeY)
+
+ when (format) {
+ YuvFormat.NV21 -> {
+ val startY = 0
+ val startU = sizeX * sizeY + 1
+ val startV = sizeX * sizeY
+
+ for (y in 0 until sizeY) {
+ for (x in 0 until sizeX) {
+ val offsetY = y * sizeX + x
+ val offsetU = ((y shr 1) * sizeX + (x shr 1) * 2)
+ val offsetV = ((y shr 1) * sizeX + (x shr 1) * 2)
+ output[x, y] = yuvToRGBA4(
+ inputArray[startY + offsetY],
+ inputArray[startU + offsetU],
+ inputArray[startV + offsetV]
+ )
+ }
+ }
+ }
+
+ YuvFormat.YV12 -> {
+ /* According to https://developer.android.com/reference/kotlin/android/graphics/ImageFormat#yv12,
+ * strideX and strideUV should be aligned to 16 byte boundaries. If we do this, we
+ * won't get the same results as RenderScript.
+ *
+ * We may want to test & require that sizeX is a multiple of 16/32.
+ */
+ val strideX = roundUpTo16(sizeX) // sizeX //
+ val strideUV = roundUpTo16(strideX / 2) // strideX / 2 //
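+ // For instance, for sizeX = 800 (the width of the test images), strideX = roundUpTo16(800) = 800
+ // and strideUV = roundUpTo16(400) = 400, so the alignment happens to change nothing there.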
+ val startY = 0
+ val startU = strideX * sizeY
+ val startV = startU + strideUV * sizeY / 2
+
+ for (y in 0 until sizeY) {
+ for (x in 0 until sizeX) {
+ val offsetY = y * sizeX + x
+ val offsetUV = (y shr 1) * strideUV + (x shr 1)
+ output[x, y] = yuvToRGBA4(
+ inputArray[startY + offsetY],
+ inputArray[startU + offsetUV],
+ inputArray[startV + offsetUV],
+ )
+ }
+ }
+ }
+ else -> throw IllegalArgumentException("Unknown YUV format $format")
+ }
+
+ return outputArray
+}
+
+@ExperimentalUnsignedTypes
+private fun yuvToRGBA4(y: UByte, u: UByte, v: UByte): UByteArray {
+ val intY = y.toInt() - 16
+ val intU = u.toInt() - 128
+ val intV = v.toInt() - 128
+ val p = intArrayOf(
+ intY * 298 + intV * 409 + 128 shr 8,
+ intY * 298 - intU * 100 - intV * 208 + 128 shr 8,
+ intY * 298 + intU * 516 + 128 shr 8,
+ 255
+ )
+ return UByteArray(4) { p[it].clampToUByte() }
+}
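+
+// Informal sanity check: for y = 128 and u = v = 128, intY = 112 and intU = intV = 0, so R, G
+// and B all evaluate to (112 * 298 + 128) shr 8 = 130; a mid-gray input maps to (130, 130, 130, 255).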
+
+/* To be used if we support Float
+private fun yuvToRGBA_f4(y: UByte, u: UByte, v: UByte): UByteArray {
+ val yuv_U_values = floatArrayOf(0f, -0.392f * 0.003921569f, 2.02f * 0.003921569f, 0f)
+ val yuv_V_values = floatArrayOf(1.603f * 0.003921569f, -0.815f * 0.003921569f, 0f, 0f)
+
+ var color = FloatArray(4) {y.toFloat() * 0.003921569f}
+ val fU = FloatArray(4) {u.toFloat() - 128f}
+ val fV = FloatArray(4) {v.toFloat() - 128f}
+
+ color += fU * yuv_U_values;
+ color += fV * yuv_V_values;
+ //color = clamp(color, 0.f, 1.f);
+ return UByteArray(4) { unitFloatClampedToUByte(color[it]) }
+}
+*/
diff --git a/toolkit/test/TimingTracker.kt b/toolkit/test/TimingTracker.kt
new file mode 100644
index 0000000..81e90f2
--- /dev/null
+++ b/toolkit/test/TimingTracker.kt
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+class TimingTracker(
+ private val numberOfIterations: Int = 1,
+ private var numberOfIterationsToIgnore: Int = 0
+) {
+ init {
+ require(numberOfIterations > numberOfIterationsToIgnore)
+ }
+ private val timings = mutableMapOf<String, IntArray>()
+ private var currentIteration: Int = 0
+ fun nextIteration() {
+ currentIteration++
+ }
+ fun <T> measure(name: String, workToTime: () -> T): T {
+ val start = System.nanoTime()
+ val t = workToTime()
+ if (currentIteration >= numberOfIterationsToIgnore) {
+ val end = System.nanoTime()
+ val deltaInMicroseconds: Int = ((end - start) / 1000).toInt()
+ val timing = timings.getOrPut(name) {
+ IntArray(numberOfIterations - numberOfIterationsToIgnore)
+ }
+ timing[currentIteration - numberOfIterationsToIgnore] += deltaInMicroseconds
+ }
+ return t
+ }
+ fun report(): String {
+ var minimum: Int = Int.MAX_VALUE
+ for (timing in timings.values) {
+ val m = timing.minOrNull()
+ if (m != null && m < minimum) minimum = m
+ }
+
+ println(timings.map { (name, timing) -> name + ": " + timing.minOrNull() }.joinToString(separator = "\n"))
+
+ return (timings.map { (name, timing) -> name + ": " + timing.joinToString() }.joinToString() + "\n\n" +
+ timings.map { (name, timing) -> name + ": " + timing.joinToString { "%.2f".format(it.toFloat() / minimum) } }.joinToString() + "\n\n" +
+ timings.map { (name, timing) -> name + ": " + timing.minOrNull() }.joinToString())
+ }
+}
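+
+// Typical use (as in MainActivity): wrap each timed operation in measure("name") { ... }, call
+// nextIteration() once per pass, and call report() at the end. Timings are recorded in
+// microseconds and the first numberOfIterationsToIgnore iterations are skipped.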
+
diff --git a/toolkit/test/res/drawable-nodpi/img800x450a.jpg b/toolkit/test/res/drawable-nodpi/img800x450a.jpg
new file mode 100644
index 0000000..6d5b623
--- /dev/null
+++ b/toolkit/test/res/drawable-nodpi/img800x450a.jpg
Binary files differ
diff --git a/toolkit/test/res/drawable-nodpi/img800x450b.jpg b/toolkit/test/res/drawable-nodpi/img800x450b.jpg
new file mode 100644
index 0000000..2013e07
--- /dev/null
+++ b/toolkit/test/res/drawable-nodpi/img800x450b.jpg
Binary files differ
diff --git a/toolkit/x86.cpp b/toolkit/x86.cpp
new file mode 100644
index 0000000..d25c3d7
--- /dev/null
+++ b/toolkit/x86.cpp
@@ -0,0 +1,1323 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+#include <x86intrin.h>
+
+namespace android {
+namespace renderscript {
+
+/* Unsigned-extend packed 8-bit integers (in the LSBs) into packed 32-bit integers */
+static inline __m128i cvtepu8_epi32(__m128i x) {
+#if defined(__SSE4_1__)
+ return _mm_cvtepu8_epi32(x);
+#elif defined(__SSSE3__)
+ const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
+ x = _mm_shuffle_epi8(x, M8to32);
+ return x;
+#else
+# error "Require at least SSSE3"
+#endif
+}
+
+static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
+#if defined(__SSE4_1__)
+ return _mm_packus_epi32(lo, hi);
+#elif defined(__SSSE3__)
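+ // SSSE3 fallback: lanes below zero are cleared by the compare-with-0 mask, lanes above 0xffff
+ // are forced to all-ones by the compare-with-0xffff, and the byte shuffles then pack the low
+ // 16 bits of each 32-bit lane, emulating unsigned 16-bit saturation.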
+ const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
+ const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
+ const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
+ const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
+ lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
+ lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
+ hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
+ hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
+ return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
+ _mm_shuffle_epi8(hi, M32to16H));
+#else
+# error "Require at least SSSE3"
+#endif
+}
+
+static inline __m128i mullo_epi32(__m128i x, __m128i y) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(x, y);
+#elif defined(__SSSE3__)
+ const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
+ __m128i even = _mm_mul_epu32(x, y);
+ __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
+ _mm_srli_si128(y, 4));
+ even = _mm_and_si128(even, Meven);
+ odd = _mm_and_si128(odd, Meven);
+ return _mm_or_si128(even, _mm_slli_si128(odd, 4));
+#else
+# error "Require at least SSSE3"
+#endif
+}
+
+/* 'mask' must be packed 8-bit values, each either 0x00 or 0xff */
+static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
+#if defined(__SSE4_1__)
+ return _mm_blendv_epi8(x, y, mask);
+#elif defined(__SSSE3__)
+ return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
+#else
+# error "Require at least SSSE3"
+#endif
+}
+
+extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
+ const void *y1, const void *y2,
+ const short *coef, uint32_t count) {
+ __m128i x;
+ __m128i c0, c2, c4, c6, c8;
+ __m128i r0, r1, r2;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
+ __m128i o0, o1;
+ uint32_t i;
+
+ x = _mm_loadl_epi64((const __m128i *)(coef+0));
+ c0 = _mm_shuffle_epi32(x, 0x00);
+ c2 = _mm_shuffle_epi32(x, 0x55);
+ x = _mm_loadl_epi64((const __m128i *)(coef+4));
+ c4 = _mm_shuffle_epi32(x, 0x00);
+ c6 = _mm_shuffle_epi32(x, 0x55);
+ x = _mm_loadl_epi64((const __m128i *)(coef+8));
+ c8 = _mm_shuffle_epi32(x, 0x00);
+
+ for (i = 0; i < count; ++i) {
+
+ p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
+ p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
+ p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
+ p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
+ p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
+ p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
+ p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
+ p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
+ p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
+ p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
+ p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
+ p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
+
+ o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
+ o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
+
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
+
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
+
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
+
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
+
+ o0 = _mm_srai_epi32(o0, 8);
+ o1 = _mm_srai_epi32(o1, 8);
+
+ o0 = packus_epi32(o0, o1);
+ o0 = _mm_packus_epi16(o0, o0);
+ _mm_storel_epi64((__m128i *)dst, o0);
+
+ y0 = (const char *)y0 + 8;
+ y1 = (const char *)y1 + 8;
+ y2 = (const char *)y2 + 8;
+ dst = (char *)dst + 8;
+ }
+}
+
+void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
+ const short *coef, uint32_t count) {
+ const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
+ 14, 10, 6, 2,
+ 13, 9, 5, 1,
+ 12, 8, 4, 0);
+
+ const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
+ const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
+ __m128i c0, c1, c2, c3;
+ __m128i i4, o4;
+ __m128i xy, zw;
+ __m128i x2, y2, z2, w2;
+ uint32_t i;
+
+ c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
+ c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
+ c0 = _mm_unpacklo_epi16(c0, c1);
+
+ c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
+ c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
+ c2 = _mm_unpacklo_epi16(c2, c3);
+
+ for (i = 0; i < count; ++i) {
+ i4 = _mm_load_si128((const __m128i *)src);
+ xy = _mm_shuffle_epi8(i4, Mxy);
+ zw = _mm_shuffle_epi8(i4, Mzw);
+
+ x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
+ y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
+ z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
+ w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
+
+ x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
+ y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
+ z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
+ w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
+
+ x2 = _mm_srai_epi32(x2, 8);
+ y2 = _mm_srai_epi32(y2, 8);
+ z2 = _mm_srai_epi32(z2, 8);
+ w2 = _mm_srai_epi32(w2, 8);
+
+ x2 = packus_epi32(x2, y2);
+ z2 = packus_epi32(z2, w2);
+ o4 = _mm_packus_epi16(x2, z2);
+
+ o4 = _mm_shuffle_epi8(o4, T4x4);
+ _mm_storeu_si128((__m128i *)dst, o4);
+
+ src = (const char *)src + 16;
+ dst = (char *)dst + 16;
+ }
+}
+
+void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
+ const short *coef, uint32_t count) {
+ const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
+ 14, 10, 6, 2,
+ 13, 9, 5, 1,
+ 12, 8, 4, 0);
+
+ const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
+ const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
+
+ __m128i c0, c1, c2, c3;
+ __m128i i4, o4;
+ __m128i xy, zw;
+ __m128i x2, y2, z2, w2;
+ uint32_t i;
+
+ c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
+ c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
+ c0 = _mm_unpacklo_epi16(c0, c1);
+
+ c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
+ c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
+ c2 = _mm_unpacklo_epi16(c2, c3);
+
+ for (i = 0; i < count; ++i) {
+ i4 = _mm_loadu_si128((const __m128i *)src);
+ xy = _mm_shuffle_epi8(i4, Mxy);
+ zw = _mm_shuffle_epi8(i4, Mzw);
+
+ x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
+ y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
+ z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
+
+ x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
+ y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
+ z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
+
+ x2 = _mm_srai_epi32(x2, 8);
+ y2 = _mm_srai_epi32(y2, 8);
+ z2 = _mm_srai_epi32(z2, 8);
+ w2 = _mm_srli_epi32(zw, 16);
+
+ x2 = packus_epi32(x2, y2);
+ z2 = packus_epi32(z2, w2);
+ o4 = _mm_packus_epi16(x2, z2);
+
+ o4 = _mm_shuffle_epi8(o4, T4x4);
+ _mm_storeu_si128((__m128i *)dst, o4);
+
+ src = (const char *)src + 16;
+ dst = (char *)dst + 16;
+ }
+}
+
+void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
+ const short *coef, uint32_t count) {
+ const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
+ 14, 10, 6, 2,
+ 13, 9, 5, 1,
+ 12, 8, 4, 0);
+ const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
+ const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
+ __m128i c0, c1, c2, c3;
+ __m128i i4, o4;
+ __m128i xy, zw;
+ __m128i x2, y2, z2, w2;
+ uint32_t i;
+
+ c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
+ c0 = _mm_shufflelo_epi16(c0, 0);
+ c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
+ c1 = _mm_shufflelo_epi16(c1, 0);
+ c0 = _mm_unpacklo_epi16(c0, c1);
+
+ c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
+ c2 = _mm_shufflelo_epi16(c2, 0);
+ c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
+ c3 = _mm_shufflelo_epi16(c3, 0);
+ c2 = _mm_unpacklo_epi16(c2, c3);
+
+ for (i = 0; i < count; ++i) {
+ i4 = _mm_loadu_si128((const __m128i *)src);
+
+ xy = _mm_shuffle_epi8(i4, Mxy);
+ zw = _mm_shuffle_epi8(i4, Mzw);
+
+ x2 = _mm_madd_epi16(xy, c0);
+ x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
+
+ x2 = _mm_srai_epi32(x2, 8);
+ y2 = x2;
+ z2 = x2;
+ w2 = _mm_srli_epi32(zw, 16);
+
+ x2 = packus_epi32(x2, y2);
+ z2 = packus_epi32(z2, w2);
+ o4 = _mm_packus_epi16(x2, z2);
+
+ o4 = _mm_shuffle_epi8(o4, T4x4);
+ _mm_storeu_si128((__m128i *)dst, o4);
+
+ src = (const char *)src + 16;
+ dst = (char *)dst + 16;
+ }
+}
+
+void rsdIntrinsicBlurVFU4_K(void *dst,
+ const void *pin, int stride, const void *gptr,
+ int rct, int x1, int x2) {
+ const char *pi;
+ __m128i pi0, pi1;
+ __m128 pf0, pf1;
+ __m128 bp0, bp1;
+ __m128 x;
+ int r;
+
+ for (; x1 < x2; x1 += 2) {
+ pi = (const char *)pin + (x1 << 2);
+ bp0 = _mm_setzero_ps();
+ bp1 = _mm_setzero_ps();
+
+ for (r = 0; r < rct; ++r) {
+ x = _mm_load_ss((const float *)gptr + r);
+ x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
+
+ pi0 = _mm_cvtsi32_si128(*(const int *)pi);
+ pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
+
+ pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
+ pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
+
+ bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
+ bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
+
+ pi += stride;
+ }
+
+ _mm_storeu_ps((float *)dst, bp0);
+ _mm_storeu_ps((float *)dst + 4, bp1);
+ dst = (char *)dst + 32;
+ }
+}
+
+void rsdIntrinsicBlurHFU4_K(void *dst,
+ const void *pin, const void *gptr,
+ int rct, int x1, int x2) {
+ const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
+ const float *pi;
+ __m128 pf, x, y;
+ __m128i o;
+ int r;
+
+ for (; x1 < x2; ++x1) {
+ /* rct is defined as 2*r+1 by the caller */
+ x = _mm_load_ss((const float *)gptr);
+ x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
+
+ pi = (const float *)pin + (x1 << 2);
+ pf = _mm_mul_ps(x, _mm_load_ps(pi));
+
+ for (r = 1; r < rct; r += 2) {
+ x = _mm_load_ss((const float *)gptr + r);
+ y = _mm_load_ss((const float *)gptr + r + 1);
+ x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
+ y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
+
+ pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
+ pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
+ }
+
+ o = _mm_cvtps_epi32(pf);
+ *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
+ dst = (char *)dst + 4;
+ }
+}
+
+void rsdIntrinsicBlurHFU1_K(void *dst,
+ const void *pin, const void *gptr,
+ int rct, int x1, int x2) {
+ const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
+ const float *pi;
+ __m128 pf, g0, g1, g2, g3, gx, p0, p1;
+ __m128i o;
+ int r;
+
+ for (; x1 < x2; x1+=4) {
+ g0 = _mm_load_ss((const float *)gptr);
+ g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
+
+ pi = (const float *)pin + x1;
+ pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
+
+ for (r = 1; r < rct; r += 4) {
+ gx = _mm_loadu_ps((const float *)gptr + r);
+ p0 = _mm_loadu_ps(pi + r);
+ p1 = _mm_loadu_ps(pi + r + 4);
+
+ g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
+ pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
+ g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
+ /* _mm_alignr_epi8 works on integer vectors, so reinterpret the float data via casts. */
+ pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
+ g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
+ pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
+ g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
+ pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
+ }
+
+ o = _mm_cvtps_epi32(pf);
+ *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
+ dst = (char *)dst + 4;
+ }
+}
+
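+// Convert YUV with an interleaved chroma plane (pUV) to RGBA8888 using the
+// fixed-point coefficients in 'param'; four pixels per iteration, alpha = 255.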
+void rsdIntrinsicYuv_K(void *dst,
+ const unsigned char *pY, const unsigned char *pUV,
+ uint32_t count, const short *param) {
+ __m128i biasY, biasUV;
+ __m128i c0, c1, c2, c3, c4;
+
+ biasY = _mm_set1_epi32(param[8]); /* 16 */
+ biasUV = _mm_set1_epi32(param[16]); /* 128 */
+
+ c0 = _mm_set1_epi32(param[0]); /* 298 */
+ c1 = _mm_set1_epi32(param[1]); /* 409 */
+ c2 = _mm_set1_epi32(param[2]); /* -100 */
+ c3 = _mm_set1_epi32(param[3]); /* 516 */
+ c4 = _mm_set1_epi32(param[4]); /* -208 */
+
+ __m128i Y, UV, U, V, R, G, B, A;
+
+ A = _mm_set1_epi32(255);
+ uint32_t i;
+
+ for (i = 0; i < (count << 1); ++i) {
+ Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
+ UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
+
+ Y = _mm_sub_epi32(Y, biasY);
+ UV = _mm_sub_epi32(UV, biasUV);
+
+ U = _mm_shuffle_epi32(UV, 0xf5);
+ V = _mm_shuffle_epi32(UV, 0xa0);
+
+ Y = mullo_epi32(Y, c0);
+
+ R = _mm_add_epi32(Y, mullo_epi32(V, c1));
+ R = _mm_add_epi32(R, biasUV);
+ R = _mm_srai_epi32(R, 8);
+
+ G = _mm_add_epi32(Y, mullo_epi32(U, c2));
+ G = _mm_add_epi32(G, mullo_epi32(V, c4));
+ G = _mm_add_epi32(G, biasUV);
+ G = _mm_srai_epi32(G, 8);
+
+ B = _mm_add_epi32(Y, mullo_epi32(U, c3));
+ B = _mm_add_epi32(B, biasUV);
+ B = _mm_srai_epi32(B, 8);
+
+ __m128i y1, y2, y3, y4;
+
+ y1 = packus_epi32(R, G);
+ y2 = packus_epi32(B, A);
+ y3 = _mm_packus_epi16(y1, y2);
+ const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
+ 14, 10, 6, 2,
+ 13, 9, 5, 1,
+ 12, 8, 4, 0);
+ y4 = _mm_shuffle_epi8(y3, T4x4);
+ _mm_storeu_si128((__m128i *)dst, y4);
+ pY += 4;
+ pUV += 4;
+ dst = (__m128i *)dst + 1;
+ }
+}
+
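+// Same conversion as rsdIntrinsicYuv_K, but with the U/V byte order of the
+// interleaved chroma plane swapped.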
+void rsdIntrinsicYuvR_K(void *dst,
+ const unsigned char *pY, const unsigned char *pUV,
+ uint32_t count, const short *param) {
+ __m128i biasY, biasUV;
+ __m128i c0, c1, c2, c3, c4;
+
+ biasY = _mm_set1_epi32(param[8]); /* 16 */
+ biasUV = _mm_set1_epi32(param[16]); /* 128 */
+
+ c0 = _mm_set1_epi32(param[0]); /* 298 */
+ c1 = _mm_set1_epi32(param[1]); /* 409 */
+ c2 = _mm_set1_epi32(param[2]); /* -100 */
+ c3 = _mm_set1_epi32(param[3]); /* 516 */
+ c4 = _mm_set1_epi32(param[4]); /* -208 */
+
+ __m128i Y, UV, U, V, R, G, B, A;
+
+ A = _mm_set1_epi32(255);
+ uint32_t i;
+
+ for (i = 0; i < (count << 1); ++i) {
+ Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
+ UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
+
+ Y = _mm_sub_epi32(Y, biasY);
+ UV = _mm_sub_epi32(UV, biasUV);
+
+ V = _mm_shuffle_epi32(UV, 0xf5);
+ U = _mm_shuffle_epi32(UV, 0xa0);
+
+ Y = mullo_epi32(Y, c0);
+
+ R = _mm_add_epi32(Y, mullo_epi32(V, c1));
+ R = _mm_add_epi32(R, biasUV);
+ R = _mm_srai_epi32(R, 8);
+
+ G = _mm_add_epi32(Y, mullo_epi32(U, c2));
+ G = _mm_add_epi32(G, mullo_epi32(V, c4));
+ G = _mm_add_epi32(G, biasUV);
+ G = _mm_srai_epi32(G, 8);
+
+ B = _mm_add_epi32(Y, mullo_epi32(U, c3));
+ B = _mm_add_epi32(B, biasUV);
+ B = _mm_srai_epi32(B, 8);
+
+ __m128i y1, y2, y3, y4;
+
+ y1 = packus_epi32(R, G);
+ y2 = packus_epi32(B, A);
+ y3 = _mm_packus_epi16(y1, y2);
+ const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
+ 14, 10, 6, 2,
+ 13, 9, 5, 1,
+ 12, 8, 4, 0);
+ y4 = _mm_shuffle_epi8(y3, T4x4);
+ _mm_storeu_si128((__m128i *)dst, y4);
+ pY += 4;
+ pUV += 4;
+ dst = (__m128i *)dst + 1;
+ }
+}
+
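+// Same conversion for fully planar YUV with separate U and V planes.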
+void rsdIntrinsicYuv2_K(void *dst,
+ const unsigned char *pY, const unsigned char *pU,
+ const unsigned char *pV, uint32_t count, const short *param) {
+ __m128i biasY, biasUV;
+ __m128i c0, c1, c2, c3, c4;
+
+ biasY = _mm_set1_epi32(param[8]); /* 16 */
+ biasUV = _mm_set1_epi32(param[16]); /* 128 */
+
+ c0 = _mm_set1_epi32(param[0]); /* 298 */
+ c1 = _mm_set1_epi32(param[1]); /* 409 */
+ c2 = _mm_set1_epi32(param[2]); /* -100 */
+ c3 = _mm_set1_epi32(param[3]); /* 516 */
+ c4 = _mm_set1_epi32(param[4]); /* -208 */
+
+ __m128i Y, U, V, R, G, B, A;
+
+ A = _mm_set1_epi32(255);
+ uint32_t i;
+
+ for (i = 0; i < (count << 1); ++i) {
+ Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
+ U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
+ V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
+
+ Y = _mm_sub_epi32(Y, biasY);
+ U = _mm_sub_epi32(U, biasUV);
+ V = _mm_sub_epi32(V, biasUV);
+
+ Y = mullo_epi32(Y, c0);
+
+ R = _mm_add_epi32(Y, mullo_epi32(V, c1));
+ R = _mm_add_epi32(R, biasUV);
+ R = _mm_srai_epi32(R, 8);
+
+ G = _mm_add_epi32(Y, mullo_epi32(U, c2));
+ G = _mm_add_epi32(G, mullo_epi32(V, c4));
+ G = _mm_add_epi32(G, biasUV);
+ G = _mm_srai_epi32(G, 8);
+
+ B = _mm_add_epi32(Y, mullo_epi32(U, c3));
+ B = _mm_add_epi32(B, biasUV);
+ B = _mm_srai_epi32(B, 8);
+
+ __m128i y1, y2, y3, y4;
+
+ y1 = packus_epi32(R, G);
+ y2 = packus_epi32(B, A);
+ y3 = _mm_packus_epi16(y1, y2);
+ const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
+ 14, 10, 6, 2,
+ 13, 9, 5, 1,
+ 12, 8, 4, 0);
+ y4 = _mm_shuffle_epi8(y3, T4x4);
+ _mm_storeu_si128((__m128i *)dst, y4);
+ pY += 4;
+ pU += 4;
+ pV += 4;
+ dst = (__m128i *)dst + 1;
+ }
+}
+
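+// 5x5 convolution over uchar4 pixels: each iteration expands eight pixels from
+// each of the five input rows to 16 bits, forms the 25-tap sums for four output
+// pixels with _mm_madd_epi16, shifts right by 8, and packs back to uchar4.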
+extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
+ const void *y1, const void *y2,
+ const void *y3, const void *y4,
+ const short *coef, uint32_t count) {
+ __m128i x;
+ __m128i c0, c2, c4, c6, c8, c10, c12;
+ __m128i c14, c16, c18, c20, c22, c24;
+ __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+ __m128i p8, p9, p10, p11, p12, p13, p14, p15;
+ __m128i p16, p17, p18, p19, p20, p21, p22, p23;
+ __m128i p24, p25, p26, p27, p28, p29, p30, p31;
+ __m128i p32, p33, p34, p35, p36, p37, p38, p39;
+ __m128i o0, o1, o2, o3;
+ uint32_t i;
+
+ x = _mm_loadl_epi64((const __m128i *)(coef+0));
+ c0 = _mm_shuffle_epi32(x, 0x00);
+ c2 = _mm_shuffle_epi32(x, 0x55);
+
+ x = _mm_loadl_epi64((const __m128i *)(coef+4));
+ c4 = _mm_shuffle_epi32(x, 0x00);
+ c6 = _mm_shuffle_epi32(x, 0x55);
+
+ x = _mm_loadl_epi64((const __m128i *)(coef+8));
+ c8 = _mm_shuffle_epi32(x, 0x00);
+ c10 = _mm_shuffle_epi32(x, 0x55);
+
+ x = _mm_loadl_epi64((const __m128i *)(coef+12));
+ c12 = _mm_shuffle_epi32(x, 0x00);
+ c14 = _mm_shuffle_epi32(x, 0x55);
+
+ x = _mm_loadl_epi64((const __m128i *)(coef+16));
+ c16 = _mm_shuffle_epi32(x, 0x00);
+ c18 = _mm_shuffle_epi32(x, 0x55);
+
+ x = _mm_loadl_epi64((const __m128i *)(coef+20));
+ c20 = _mm_shuffle_epi32(x, 0x00);
+ c22 = _mm_shuffle_epi32(x, 0x55);
+
+ x = _mm_loadl_epi64((const __m128i *)(coef+24));
+ c24 = _mm_shuffle_epi32(x, 0x00);
+
+ for (i = 0; i < count; ++i) {
+
+ p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
+ p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
+ p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
+ p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
+ p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
+ p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
+ p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
+ p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
+
+ p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
+ p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
+ p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
+ p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
+ p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
+ p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
+ p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
+ p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
+
+ p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
+ p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
+ p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
+ p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
+ p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
+ p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
+ p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
+ p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
+
+ p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
+ p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
+ p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
+ p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
+ p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
+ p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
+ p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
+ p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
+
+ p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
+ p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
+ p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
+ p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
+ p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
+ p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
+ p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
+ p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
+
+ o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0);
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10), c6));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
+ o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
+ o0 = _mm_srai_epi32(o0, 8);
+
+ o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0);
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c2));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11), c6));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13), c8));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
+ o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
+ o1 = _mm_srai_epi32(o1, 8);
+
+ o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3), c0);
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5), c2));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
+ o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
+ o2 = _mm_srai_epi32(o2, 8);
+
+ o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c0);
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
+ o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
+ o3 = _mm_srai_epi32(o3, 8);
+
+ o0 = packus_epi32(o0, o1);
+ o2 = packus_epi32(o2, o3);
+ o0 = _mm_packus_epi16(o0, o2);
+ _mm_storeu_si128((__m128i *)dst, o0);
+
+ y0 = (const char *)y0 + 16;
+ y1 = (const char *)y1 + 16;
+ y2 = (const char *)y2 + 16;
+ y3 = (const char *)y3 + 16;
+ y4 = (const char *)y4 + 16;
+ dst = (char *)dst + 16;
+ }
+}
+
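+// Porter-Duff "source over": out = src + ((dst * (255 - src.a)) >> 8),
+// processing 8 pixels (two 16-byte vectors) per iteration.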
+void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
+ __m128i all1s, ina, ins;
+ __m128i in0, in1, out0, out1;
+ __m128i t0, t1, t2, t3;
+ uint32_t i;
+
+ all1s = _mm_set1_epi16(255);
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+ t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
+ t0 = _mm_srli_epi16(t0, 8);
+ t0 = _mm_add_epi16(t0, ins);
+
+ ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+ t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
+ t1 = _mm_srli_epi16(t1, 8);
+ t1 = _mm_add_epi16(t1, ins);
+
+ ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+ t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
+ t2 = _mm_srli_epi16(t2, 8);
+ t2 = _mm_add_epi16(t2, ins);
+
+ ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+ t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
+ t3 = _mm_srli_epi16(t3, 8);
+ t3 = _mm_add_epi16(t3, ins);
+
+ t0 = _mm_packus_epi16(t0, t1);
+ t2 = _mm_packus_epi16(t2, t3);
+ _mm_storeu_si128((__m128i *)dst, t0);
+ _mm_storeu_si128((__m128i *)dst + 1, t2);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
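+// Porter-Duff "destination over": out = dst + ((src * (255 - dst.a)) >> 8).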
+void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
+ __m128i all1s, outa, outs;
+ __m128i in0, in1, out0, out1;
+ __m128i t0, t1, t2, t3;
+ uint32_t i;
+
+ all1s = _mm_set1_epi16(255);
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+ t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
+ t0 = _mm_srli_epi16(t0, 8);
+ t0 = _mm_add_epi16(t0, outs);
+
+ outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+ t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
+ t1 = _mm_srli_epi16(t1, 8);
+ t1 = _mm_add_epi16(t1, outs);
+
+ outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+ t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
+ t2 = _mm_srli_epi16(t2, 8);
+ t2 = _mm_add_epi16(t2, outs);
+
+ outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+ t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
+ t3 = _mm_srli_epi16(t3, 8);
+ t3 = _mm_add_epi16(t3, outs);
+
+ t0 = _mm_packus_epi16(t0, t1);
+ t2 = _mm_packus_epi16(t2, t3);
+ _mm_storeu_si128((__m128i *)dst, t0);
+ _mm_storeu_si128((__m128i *)dst + 1, t2);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
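+// Porter-Duff "source in": out = (src * dst.a) >> 8.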
+void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
+ __m128i outa;
+ __m128i in0, in1, out0, out1;
+ __m128i t0, t1, t2, t3;
+ uint32_t i;
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outa, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+ t0 = _mm_mullo_epi16(t0, outa);
+ t0 = _mm_srli_epi16(t0, 8);
+
+ outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outa, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+ t1 = _mm_mullo_epi16(t1, outa);
+ t1 = _mm_srli_epi16(t1, 8);
+
+ outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outa, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+ t2 = _mm_mullo_epi16(t2, outa);
+ t2 = _mm_srli_epi16(t2, 8);
+
+ outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outa, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+ t3 = _mm_mullo_epi16(t3, outa);
+ t3 = _mm_srli_epi16(t3, 8);
+
+ t0 = _mm_packus_epi16(t0, t1);
+ t2 = _mm_packus_epi16(t2, t3);
+ _mm_storeu_si128((__m128i *)dst, t0);
+ _mm_storeu_si128((__m128i *)dst + 1, t2);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
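+// Porter-Duff "destination in": out = (dst * src.a) >> 8.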
+void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
+ __m128i ina;
+ __m128i in0, in1, out0, out1;
+ __m128i t0, t1, t2, t3;
+ uint32_t i;
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ina, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+ t0 = _mm_mullo_epi16(t0, ina);
+ t0 = _mm_srli_epi16(t0, 8);
+
+ ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ina, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+ t1 = _mm_mullo_epi16(t1, ina);
+ t1 = _mm_srli_epi16(t1, 8);
+
+ ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ina, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+ t2 = _mm_mullo_epi16(t2, ina);
+ t2 = _mm_srli_epi16(t2, 8);
+
+ ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ina, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+ t3 = _mm_mullo_epi16(t3, ina);
+ t3 = _mm_srli_epi16(t3, 8);
+
+ t0 = _mm_packus_epi16(t0, t1);
+ t2 = _mm_packus_epi16(t2, t3);
+ _mm_storeu_si128((__m128i *)dst, t0);
+ _mm_storeu_si128((__m128i *)dst + 1, t2);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
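+// Porter-Duff "source out": out = (src * (255 - dst.a)) >> 8.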
+void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
+ __m128i all1s, outa;
+ __m128i in0, in1, out0, out1;
+ __m128i t0, t1, t2, t3;
+ uint32_t i;
+
+ all1s = _mm_set1_epi16(255);
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outa, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+ t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
+ t0 = _mm_srli_epi16(t0, 8);
+
+ outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outa, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+ t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
+ t1 = _mm_srli_epi16(t1, 8);
+
+ outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outa, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+ t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
+ t2 = _mm_srli_epi16(t2, 8);
+
+ outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outa, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+ t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
+ t3 = _mm_srli_epi16(t3, 8);
+
+ t0 = _mm_packus_epi16(t0, t1);
+ t2 = _mm_packus_epi16(t2, t3);
+ _mm_storeu_si128((__m128i *)dst, t0);
+ _mm_storeu_si128((__m128i *)dst + 1, t2);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
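+// Porter-Duff "destination out": out = (dst * (255 - src.a)) >> 8.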
+void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
+ __m128i all1s, ina;
+ __m128i in0, in1, out0, out1;
+ __m128i t0, t1, t2, t3;
+ uint32_t i;
+
+ all1s = _mm_set1_epi16(255);
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ina, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+ t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
+ t0 = _mm_srli_epi16(t0, 8);
+
+ ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ina, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+ t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
+ t1 = _mm_srli_epi16(t1, 8);
+
+ ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ina, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+ t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
+ t2 = _mm_srli_epi16(t2, 8);
+
+ ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ina, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+ t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
+ t3 = _mm_srli_epi16(t3, 8);
+
+ t0 = _mm_packus_epi16(t0, t1);
+ t2 = _mm_packus_epi16(t2, t3);
+ _mm_storeu_si128((__m128i *)dst, t0);
+ _mm_storeu_si128((__m128i *)dst + 1, t2);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
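+// Porter-Duff "source atop": out = ((src * dst.a) + (dst * (255 - src.a))) >> 8,
+// with the destination alpha preserved via the M0001 byte mask.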
+void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
+ const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
+ __m128i all1s, ina, outa, ins, outs;
+ __m128i in0, in1, out0, out1;
+ __m128i t0, t1, t2, t3;
+ uint32_t i;
+
+ all1s = _mm_set1_epi16(255);
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t0 = _mm_sub_epi16(all1s, ina);
+ t0 = _mm_mullo_epi16(t0, outs);
+ t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
+ t0 = _mm_srli_epi16(t0, 8);
+
+ ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t1 = _mm_sub_epi16(all1s, ina);
+ t1 = _mm_mullo_epi16(t1, outs);
+ t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
+ t1 = _mm_srli_epi16(t1, 8);
+
+ ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t2 = _mm_sub_epi16(all1s, ina);
+ t2 = _mm_mullo_epi16(t2, outs);
+ t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
+ t2 = _mm_srli_epi16(t2, 8);
+
+ ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t3 = _mm_sub_epi16(all1s, ina);
+ t3 = _mm_mullo_epi16(t3, outs);
+ t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
+ t3 = _mm_srli_epi16(t3, 8);
+
+ t0 = _mm_packus_epi16(t0, t1);
+ t0 = blendv_epi8(t0, out0, M0001);
+ t2 = _mm_packus_epi16(t2, t3);
+ t2 = blendv_epi8(t2, out1, M0001);
+ _mm_storeu_si128((__m128i *)dst, t0);
+ _mm_storeu_si128((__m128i *)dst + 1, t2);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
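+// Porter-Duff "destination atop": out = ((dst * src.a) + (src * (255 - dst.a))) >> 8,
+// with the source alpha taken via the M0001 byte mask.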
+void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
+ const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
+ __m128i all1s, ina, ins, outa, outs;
+ __m128i in0, in1, out0, out1;
+ __m128i t0, t1, t2, t3;
+ uint32_t i;
+
+ all1s = _mm_set1_epi16(255);
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t0 = _mm_sub_epi16(all1s, outa);
+ t0 = _mm_mullo_epi16(t0, ins);
+ t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
+ t0 = _mm_srli_epi16(t0, 8);
+
+ ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t1 = _mm_sub_epi16(all1s, outa);
+ t1 = _mm_mullo_epi16(t1, ins);
+ t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
+ t1 = _mm_srli_epi16(t1, 8);
+
+ ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t2 = _mm_sub_epi16(all1s, outa);
+ t2 = _mm_mullo_epi16(t2, ins);
+ t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
+ t2 = _mm_srli_epi16(t2, 8);
+
+ ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+ ina = _mm_shufflelo_epi16(ins, 0xFF);
+ ina = _mm_shufflehi_epi16(ina, 0xFF);
+ outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+ outa = _mm_shufflelo_epi16(outs, 0xFF);
+ outa = _mm_shufflehi_epi16(outa, 0xFF);
+ t3 = _mm_sub_epi16(all1s, outa);
+ t3 = _mm_mullo_epi16(t3, ins);
+ t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
+ t3 = _mm_srli_epi16(t3, 8);
+
+ t0 = _mm_packus_epi16(t0, t1);
+ t0 = blendv_epi8(t0, in0, M0001);
+ t2 = _mm_packus_epi16(t2, t3);
+ t2 = blendv_epi8(t2, in1, M0001);
+ _mm_storeu_si128((__m128i *)dst, t0);
+ _mm_storeu_si128((__m128i *)dst + 1, t2);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
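+// Bitwise XOR of source and destination pixels.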
+void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
+ __m128i in0, in1, out0, out1;
+ uint32_t i;
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ out0 = _mm_xor_si128(out0, in0);
+ out1 = _mm_xor_si128(out1, in1);
+
+ _mm_storeu_si128((__m128i *)dst, out0);
+ _mm_storeu_si128((__m128i *)dst + 1, out1);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
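+// Multiply blend: out = (src * dst) >> 8 per channel.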
+void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
+ __m128i in0, in1, out0, out1;
+ __m128i t0, t1, t2, t3;
+ uint32_t i;
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+ t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
+ t0 = _mm_srli_epi16(t0, 8);
+
+ t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+ t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
+ t1 = _mm_srli_epi16(t1, 8);
+
+ t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+ t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
+ t2 = _mm_srli_epi16(t2, 8);
+
+ t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+ t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
+ t3 = _mm_srli_epi16(t3, 8);
+
+ t0 = _mm_packus_epi16(t0, t1);
+ t2 = _mm_packus_epi16(t2, t3);
+ _mm_storeu_si128((__m128i *)dst, t0);
+ _mm_storeu_si128((__m128i *)dst + 1, t2);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
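+// Additive blend with unsigned saturation: out = saturate(dst + src).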
+void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
+ __m128i in0, in1, out0, out1;
+ uint32_t i;
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ out0 = _mm_adds_epu8(out0, in0);
+ out1 = _mm_adds_epu8(out1, in1);
+
+ _mm_storeu_si128((__m128i *)dst, out0);
+ _mm_storeu_si128((__m128i *)dst + 1, out1);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
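+// Subtractive blend with unsigned saturation: out = saturate(dst - src).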
+void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
+ __m128i in0, in1, out0, out1;
+ uint32_t i;
+
+ for (i = 0; i < count8; ++i) {
+ in0 = _mm_loadu_si128((const __m128i *)src);
+ in1 = _mm_loadu_si128((const __m128i *)src + 1);
+ out0 = _mm_loadu_si128((const __m128i *)dst);
+ out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+ out0 = _mm_subs_epu8(out0, in0);
+ out1 = _mm_subs_epu8(out1, in1);
+
+ _mm_storeu_si128((__m128i *)dst, out0);
+ _mm_storeu_si128((__m128i *)dst + 1, out1);
+
+ src = (const __m128i *)src + 2;
+ dst = (__m128i *)dst + 2;
+ }
+}
+
+} // namespace renderscript
+} // namespace android