Merge "Improve the documentation of the RenderScript API."
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index e7e7e59..e079678 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -1,4 +1,3 @@
-
LOCAL_PATH:=$(call my-dir)
rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable \
@@ -19,26 +18,26 @@
LOCAL_MODULE_TARGET_ARCH := arm mips mips64 x86 x86_64 arm64
LOCAL_SRC_FILES:= \
- rsCpuCore.cpp \
- rsCpuExecutable.cpp \
- rsCpuScript.cpp \
- rsCpuRuntimeMath.cpp \
- rsCpuRuntimeMathFuncs.cpp \
- rsCpuRuntimeStubs.cpp \
- rsCpuScriptGroup.cpp \
- rsCpuScriptGroup2.cpp \
- rsCpuIntrinsic.cpp \
- rsCpuIntrinsic3DLUT.cpp \
- rsCpuIntrinsicBLAS.cpp \
- rsCpuIntrinsicBlend.cpp \
- rsCpuIntrinsicBlur.cpp \
- rsCpuIntrinsicColorMatrix.cpp \
- rsCpuIntrinsicConvolve3x3.cpp \
- rsCpuIntrinsicConvolve5x5.cpp \
- rsCpuIntrinsicHistogram.cpp \
- rsCpuIntrinsicResize.cpp \
- rsCpuIntrinsicLUT.cpp \
- rsCpuIntrinsicYuvToRGB.cpp
+ rsCpuCore.cpp \
+ rsCpuExecutable.cpp \
+ rsCpuScript.cpp \
+ rsCpuRuntimeMath.cpp \
+ rsCpuRuntimeMathFuncs.cpp \
+ rsCpuRuntimeStubs.cpp \
+ rsCpuScriptGroup.cpp \
+ rsCpuScriptGroup2.cpp \
+ rsCpuIntrinsic.cpp \
+ rsCpuIntrinsic3DLUT.cpp \
+ rsCpuIntrinsicBLAS.cpp \
+ rsCpuIntrinsicBlend.cpp \
+ rsCpuIntrinsicBlur.cpp \
+ rsCpuIntrinsicColorMatrix.cpp \
+ rsCpuIntrinsicConvolve3x3.cpp \
+ rsCpuIntrinsicConvolve5x5.cpp \
+ rsCpuIntrinsicHistogram.cpp \
+ rsCpuIntrinsicResize.cpp \
+ rsCpuIntrinsicLUT.cpp \
+ rsCpuIntrinsicYuvToRGB.cpp
LOCAL_CFLAGS_arm64 += -DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS -DARCH_ARM64_HAVE_NEON
diff --git a/cpu_ref/rsCpuIntrinsicBLAS.cpp b/cpu_ref/rsCpuIntrinsicBLAS.cpp
index 486eed8..8aa9ddc 100644
--- a/cpu_ref/rsCpuIntrinsicBLAS.cpp
+++ b/cpu_ref/rsCpuIntrinsicBLAS.cpp
@@ -42,6 +42,17 @@
protected:
+ uint8_t a_offset = 0; // NOTE(review): these three members are not read in the hunks visible here;
+ uint8_t b_offset = 0; // kernelBGEMM is static and takes same-named uint32_t parameters (fed from
+ uint8_t c_offset = 0; // call->a_offset etc. in the RsBlas_bgemm case) -- confirm they are needed.
+
+ static void kernelBGEMM(size_t m, size_t n, size_t k, // 8-bit GEMM: writes an m x n result, k is the summed dimension
+ const uint8_t* a, uint32_t a_offset, size_t lda, // a is read as a[i*lda + l]; a_offset is subtracted from each element
+ const uint8_t* b, uint32_t b_offset, size_t ldb, // b is read as b[j*ldb + l]; b_offset is subtracted from each element
+ uint8_t* c, uint32_t c_offset, size_t ldc, // c is written as c[i*ldc + j]; c_offset is added to the accumulated sum
+ uint32_t c_mult_int); // multiplier applied to (sum + c_offset) before the rounded >> 23 and the clamp to 0..255
+
+
};
@@ -624,6 +635,17 @@
B, ldb, call->beta.d, C, ldc);
break;
+
+ case (RsBlas_bgemm):
+ initABC(ain, sizeof(uint8_t), &A, &B, &C, &lda, &ldb, &ldc);
+ kernelBGEMM(call->M, call->N, call->K,
+ (const uint8_t*)A, call->a_offset, lda,
+ (const uint8_t*)B, call->b_offset, ldb,
+ (uint8_t*)C, call->c_offset, ldc,
+ call->c_mult_int);
+
+ break;
+
default:
ALOGE("unimplemented\n");
}
@@ -631,6 +653,45 @@
}
+// Reference 8-bit GEMM. For every i < m and j < n it accumulates
+//   sum over l < k of (a[i*lda + l] - a_offset) * (b[j*ldb + l] - b_offset)
+// and stores clamp(((sum + c_offset) * c_mult_int + 2^22) >> 23, 0, 255)
+// in c[i*ldc + j]. c_offset and c_mult_int arrive as uint32_t but are applied
+// as signed values so that a negative sum clamps to 0 instead of wrapping.
+void RsdCpuScriptIntrinsicBLAS::kernelBGEMM(size_t m, size_t n, size_t k,
+        const uint8_t* a, uint32_t a_offset, size_t lda,
+        const uint8_t* b, uint32_t b_offset, size_t ldb,
+        uint8_t* c, uint32_t c_offset, size_t ldc,
+        uint32_t c_mult_int) {
+    const int c_shift = 23;  // the scaled sum is divided by 2^23
+    const int64_t c_off = static_cast<int32_t>(c_offset);
+    const int64_t c_mult = static_cast<int32_t>(c_mult_int);
+    const int64_t rounding = int64_t{1} << (c_shift - 1);  // round to nearest
+    for (size_t j = 0; j < n; j++) {
+        for (size_t i = 0; i < m; i++) {
+            int32_t total = 0;
+            for (size_t l = 0; l < k; l++) {
+                // Unsigned wrap-around, then reinterpret: yields byte - offset.
+                const int32_t a_as_int = static_cast<int32_t>(a[(i * lda) + l] - a_offset);
+                const int32_t b_as_int = static_cast<int32_t>(b[(j * ldb) + l] - b_offset);
+                total += a_as_int * b_as_int;
+            }
+            // Signed 64-bit math so a negative value clamps to 0 rather than wrapping to 255.
+            int64_t output = (((total + c_off) * c_mult) + rounding) >> c_shift;
+            if (output > 255) {
+                output = 255;
+            }
+            if (output < 0) {
+                output = 0;
+            }
+            c[(ldc * i) + j] = static_cast<uint8_t>(output);
+        }
+    }
+}
+
+
+
+
RsdCpuScriptIntrinsicBLAS::RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx,
const Script *s)
diff --git a/rsDefines.h b/rsDefines.h
index 4ccdeb8..8b334c6 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -277,148 +277,151 @@
enum RsBlasFunction {
RsBlas_nop = 0,
- RsBlas_sdsdot,
- RsBlas_dsdot,
- RsBlas_sdot,
- RsBlas_ddot,
- RsBlas_cdotu_sub,
- RsBlas_cdotc_sub,
- RsBlas_zdotu_sub,
- RsBlas_zdotc_sub,
- RsBlas_snrm2,
- RsBlas_sasum,
- RsBlas_dnrm2,
- RsBlas_dasum,
- RsBlas_scnrm2,
- RsBlas_scasum,
- RsBlas_dznrm2,
- RsBlas_dzasum,
- RsBlas_isamax,
- RsBlas_idamax,
- RsBlas_icamax,
- RsBlas_izamax,
- RsBlas_sswap,
- RsBlas_scopy,
- RsBlas_saxpy,
- RsBlas_dswap,
- RsBlas_dcopy,
- RsBlas_daxpy,
- RsBlas_cswap,
- RsBlas_ccopy,
- RsBlas_caxpy,
- RsBlas_zswap,
- RsBlas_zcopy,
- RsBlas_zaxpy,
- RsBlas_srotg,
- RsBlas_srotmg,
- RsBlas_srot,
- RsBlas_srotm,
- RsBlas_drotg,
- RsBlas_drotmg,
- RsBlas_drot,
- RsBlas_drotm,
- RsBlas_sscal,
- RsBlas_dscal,
- RsBlas_cscal,
- RsBlas_zscal,
- RsBlas_csscal,
- RsBlas_zdscal,
- RsBlas_sgemv,
- RsBlas_sgbmv,
- RsBlas_strmv,
- RsBlas_stbmv,
- RsBlas_stpmv,
- RsBlas_strsv,
- RsBlas_stbsv,
- RsBlas_stpsv,
- RsBlas_dgemv,
- RsBlas_dgbmv,
- RsBlas_dtrmv,
- RsBlas_dtbmv,
- RsBlas_dtpmv,
- RsBlas_dtrsv,
- RsBlas_dtbsv,
- RsBlas_dtpsv,
- RsBlas_cgemv,
- RsBlas_cgbmv,
- RsBlas_ctrmv,
- RsBlas_ctbmv,
- RsBlas_ctpmv,
- RsBlas_ctrsv,
- RsBlas_ctbsv,
- RsBlas_ctpsv,
- RsBlas_zgemv,
- RsBlas_zgbmv,
- RsBlas_ztrmv,
- RsBlas_ztbmv,
- RsBlas_ztpmv,
- RsBlas_ztrsv,
- RsBlas_ztbsv,
- RsBlas_ztpsv,
- RsBlas_ssymv,
- RsBlas_ssbmv,
- RsBlas_sspmv,
- RsBlas_sger,
- RsBlas_ssyr,
- RsBlas_sspr,
- RsBlas_ssyr2,
- RsBlas_sspr2,
- RsBlas_dsymv,
- RsBlas_dsbmv,
- RsBlas_dspmv,
- RsBlas_dger,
- RsBlas_dsyr,
- RsBlas_dspr,
- RsBlas_dsyr2,
- RsBlas_dspr2,
- RsBlas_chemv,
- RsBlas_chbmv,
- RsBlas_chpmv,
- RsBlas_cgeru,
- RsBlas_cgerc,
- RsBlas_cher,
- RsBlas_chpr,
- RsBlas_cher2,
- RsBlas_chpr2,
- RsBlas_zhemv,
- RsBlas_zhbmv,
- RsBlas_zhpmv,
- RsBlas_zgeru,
- RsBlas_zgerc,
- RsBlas_zher,
- RsBlas_zhpr,
- RsBlas_zher2,
- RsBlas_zhpr2,
- RsBlas_sgemm,
- RsBlas_ssymm,
- RsBlas_ssyrk,
- RsBlas_ssyr2k,
- RsBlas_strmm,
- RsBlas_strsm,
- RsBlas_dgemm,
- RsBlas_dsymm,
- RsBlas_dsyrk,
- RsBlas_dsyr2k,
- RsBlas_dtrmm,
- RsBlas_dtrsm,
- RsBlas_cgemm,
- RsBlas_csymm,
- RsBlas_csyrk,
- RsBlas_csyr2k,
- RsBlas_ctrmm,
- RsBlas_ctrsm,
- RsBlas_zgemm,
- RsBlas_zsymm,
- RsBlas_zsyrk,
- RsBlas_zsyr2k,
- RsBlas_ztrmm,
- RsBlas_ztrsm,
- RsBlas_chemm,
- RsBlas_cherk,
- RsBlas_cher2k,
- RsBlas_zhemm,
- RsBlas_zherk,
- RsBlas_zher2k
+ RsBlas_sdsdot = 1, // explicit values start here: 1..142, consecutive
+ RsBlas_dsdot = 2,
+ RsBlas_sdot = 3,
+ RsBlas_ddot = 4,
+ RsBlas_cdotu_sub = 5,
+ RsBlas_cdotc_sub = 6,
+ RsBlas_zdotu_sub = 7,
+ RsBlas_zdotc_sub = 8,
+ RsBlas_snrm2 = 9,
+ RsBlas_sasum = 10,
+ RsBlas_dnrm2 = 11,
+ RsBlas_dasum = 12,
+ RsBlas_scnrm2 = 13,
+ RsBlas_scasum = 14,
+ RsBlas_dznrm2 = 15,
+ RsBlas_dzasum = 16,
+ RsBlas_isamax = 17,
+ RsBlas_idamax = 18,
+ RsBlas_icamax = 19,
+ RsBlas_izamax = 20,
+ RsBlas_sswap = 21,
+ RsBlas_scopy = 22,
+ RsBlas_saxpy = 23,
+ RsBlas_dswap = 24,
+ RsBlas_dcopy = 25,
+ RsBlas_daxpy = 26,
+ RsBlas_cswap = 27,
+ RsBlas_ccopy = 28,
+ RsBlas_caxpy = 29,
+ RsBlas_zswap = 30,
+ RsBlas_zcopy = 31,
+ RsBlas_zaxpy = 32,
+ RsBlas_srotg = 33,
+ RsBlas_srotmg = 34,
+ RsBlas_srot = 35,
+ RsBlas_srotm = 36,
+ RsBlas_drotg = 37,
+ RsBlas_drotmg = 38,
+ RsBlas_drot = 39,
+ RsBlas_drotm = 40,
+ RsBlas_sscal = 41,
+ RsBlas_dscal = 42,
+ RsBlas_cscal = 43,
+ RsBlas_zscal = 44,
+ RsBlas_csscal = 45,
+ RsBlas_zdscal = 46,
+ RsBlas_sgemv = 47,
+ RsBlas_sgbmv = 48,
+ RsBlas_strmv = 49,
+ RsBlas_stbmv = 50,
+ RsBlas_stpmv = 51,
+ RsBlas_strsv = 52,
+ RsBlas_stbsv = 53,
+ RsBlas_stpsv = 54,
+ RsBlas_dgemv = 55,
+ RsBlas_dgbmv = 56,
+ RsBlas_dtrmv = 57,
+ RsBlas_dtbmv = 58,
+ RsBlas_dtpmv = 59,
+ RsBlas_dtrsv = 60,
+ RsBlas_dtbsv = 61,
+ RsBlas_dtpsv = 62,
+ RsBlas_cgemv = 63,
+ RsBlas_cgbmv = 64,
+ RsBlas_ctrmv = 65,
+ RsBlas_ctbmv = 66,
+ RsBlas_ctpmv = 67,
+ RsBlas_ctrsv = 68,
+ RsBlas_ctbsv = 69,
+ RsBlas_ctpsv = 70,
+ RsBlas_zgemv = 71,
+ RsBlas_zgbmv = 72,
+ RsBlas_ztrmv = 73,
+ RsBlas_ztbmv = 74,
+ RsBlas_ztpmv = 75,
+ RsBlas_ztrsv = 76,
+ RsBlas_ztbsv = 77,
+ RsBlas_ztpsv = 78,
+ RsBlas_ssymv = 79,
+ RsBlas_ssbmv = 80,
+ RsBlas_sspmv = 81,
+ RsBlas_sger = 82,
+ RsBlas_ssyr = 83,
+ RsBlas_sspr = 84,
+ RsBlas_ssyr2 = 85,
+ RsBlas_sspr2 = 86,
+ RsBlas_dsymv = 87,
+ RsBlas_dsbmv = 88,
+ RsBlas_dspmv = 89,
+ RsBlas_dger = 90,
+ RsBlas_dsyr = 91,
+ RsBlas_dspr = 92,
+ RsBlas_dsyr2 = 93,
+ RsBlas_dspr2 = 94,
+ RsBlas_chemv = 95,
+ RsBlas_chbmv = 96,
+ RsBlas_chpmv = 97,
+ RsBlas_cgeru = 98,
+ RsBlas_cgerc = 99,
+ RsBlas_cher = 100,
+ RsBlas_chpr = 101,
+ RsBlas_cher2 = 102,
+ RsBlas_chpr2 = 103,
+ RsBlas_zhemv = 104,
+ RsBlas_zhbmv = 105,
+ RsBlas_zhpmv = 106,
+ RsBlas_zgeru = 107,
+ RsBlas_zgerc = 108,
+ RsBlas_zher = 109,
+ RsBlas_zhpr = 110,
+ RsBlas_zher2 = 111,
+ RsBlas_zhpr2 = 112,
+ RsBlas_sgemm = 113,
+ RsBlas_ssymm = 114,
+ RsBlas_ssyrk = 115,
+ RsBlas_ssyr2k = 116,
+ RsBlas_strmm = 117,
+ RsBlas_strsm = 118,
+ RsBlas_dgemm = 119,
+ RsBlas_dsymm = 120,
+ RsBlas_dsyrk = 121,
+ RsBlas_dsyr2k = 122,
+ RsBlas_dtrmm = 123,
+ RsBlas_dtrsm = 124,
+ RsBlas_cgemm = 125,
+ RsBlas_csymm = 126,
+ RsBlas_csyrk = 127,
+ RsBlas_csyr2k = 128,
+ RsBlas_ctrmm = 129,
+ RsBlas_ctrsm = 130,
+ RsBlas_zgemm = 131,
+ RsBlas_zsymm = 132,
+ RsBlas_zsyrk = 133,
+ RsBlas_zsyr2k = 134,
+ RsBlas_ztrmm = 135,
+ RsBlas_ztrsm = 136,
+ RsBlas_chemm = 137,
+ RsBlas_cherk = 138,
+ RsBlas_cher2k = 139,
+ RsBlas_zhemm = 140,
+ RsBlas_zherk = 141,
+ RsBlas_zher2k = 142,
+
+ // BLAS extensions start here; they are numbered from 1000, separate from the 1..142 range above
+ RsBlas_bgemm = 1000, // 8-bit GEMM; dispatched to kernelBGEMM in rsCpuIntrinsicBLAS.cpp
};
// custom complex types because of NDK support
@@ -432,7 +435,7 @@
double i;
} RsDoubleComplex;
-typedef union {
+typedef union {
float f;
RsFloatComplex c;
double d;
@@ -455,8 +458,12 @@
int incY;
int KL;
int KU;
+ uint32_t a_offset;
+ uint32_t b_offset;
+ uint32_t c_offset;
+ uint32_t c_mult_int;
} RsBlasCall;
-
+
#ifdef __cplusplus
};
#endif
diff --git a/rsInternalDefines.h b/rsInternalDefines.h
index 8a62e40..46b848f 100644
--- a/rsInternalDefines.h
+++ b/rsInternalDefines.h
@@ -190,6 +190,7 @@
// unused 10, 11
RS_SCRIPT_INTRINSIC_ID_RESIZE = 12,
RS_SCRIPT_INTRINSIC_ID_BLAS = 13,
+ RS_SCRIPT_INTRINSIC_ID_EXTBLAS = 14,
RS_SCRIPT_INTRINSIC_ID_OEM_START = 0x10000000
};