Merge "Improve the documentation of the RenderScript API."
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index e7e7e59..e079678 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -1,4 +1,3 @@
-
 LOCAL_PATH:=$(call my-dir)
 
 rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable \
@@ -19,26 +18,26 @@
 LOCAL_MODULE_TARGET_ARCH := arm mips mips64 x86 x86_64 arm64
 
 LOCAL_SRC_FILES:= \
-	rsCpuCore.cpp \
-	rsCpuExecutable.cpp \
-	rsCpuScript.cpp \
-	rsCpuRuntimeMath.cpp \
-	rsCpuRuntimeMathFuncs.cpp \
-	rsCpuRuntimeStubs.cpp \
-	rsCpuScriptGroup.cpp \
-	rsCpuScriptGroup2.cpp \
-	rsCpuIntrinsic.cpp \
-	rsCpuIntrinsic3DLUT.cpp \
-	rsCpuIntrinsicBLAS.cpp \
-	rsCpuIntrinsicBlend.cpp \
-	rsCpuIntrinsicBlur.cpp \
-	rsCpuIntrinsicColorMatrix.cpp \
-	rsCpuIntrinsicConvolve3x3.cpp \
-	rsCpuIntrinsicConvolve5x5.cpp \
-	rsCpuIntrinsicHistogram.cpp \
-	rsCpuIntrinsicResize.cpp \
-	rsCpuIntrinsicLUT.cpp \
-	rsCpuIntrinsicYuvToRGB.cpp
+        rsCpuCore.cpp \
+        rsCpuExecutable.cpp \
+        rsCpuScript.cpp \
+        rsCpuRuntimeMath.cpp \
+        rsCpuRuntimeMathFuncs.cpp \
+        rsCpuRuntimeStubs.cpp \
+        rsCpuScriptGroup.cpp \
+        rsCpuScriptGroup2.cpp \
+        rsCpuIntrinsic.cpp \
+        rsCpuIntrinsic3DLUT.cpp \
+        rsCpuIntrinsicBLAS.cpp \
+        rsCpuIntrinsicBlend.cpp \
+        rsCpuIntrinsicBlur.cpp \
+        rsCpuIntrinsicColorMatrix.cpp \
+        rsCpuIntrinsicConvolve3x3.cpp \
+        rsCpuIntrinsicConvolve5x5.cpp \
+        rsCpuIntrinsicHistogram.cpp \
+        rsCpuIntrinsicResize.cpp \
+        rsCpuIntrinsicLUT.cpp \
+        rsCpuIntrinsicYuvToRGB.cpp
 
 LOCAL_CFLAGS_arm64 += -DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS -DARCH_ARM64_HAVE_NEON
 
diff --git a/cpu_ref/rsCpuIntrinsicBLAS.cpp b/cpu_ref/rsCpuIntrinsicBLAS.cpp
index 486eed8..8aa9ddc 100644
--- a/cpu_ref/rsCpuIntrinsicBLAS.cpp
+++ b/cpu_ref/rsCpuIntrinsicBLAS.cpp
@@ -42,6 +42,17 @@
 
 protected:
 
+    uint8_t a_offset = 0;
+    uint8_t b_offset = 0;
+    uint8_t c_offset = 0;
+
+    static void kernelBGEMM(size_t m, size_t n, size_t k,
+                            const uint8_t* a, uint32_t a_offset, size_t lda,
+                            const uint8_t* b, uint32_t b_offset, size_t ldb,
+                            uint8_t* c, uint32_t c_offset, size_t ldc,
+                            uint32_t c_mult_int);
+
+
 
 };
 
@@ -624,6 +635,17 @@
                      B, ldb, call->beta.d, C, ldc);
         break;
 
+
+    case (RsBlas_bgemm):
+        initABC(ain, sizeof(uint8_t), &A, &B, &C, &lda, &ldb, &ldc);
+        kernelBGEMM(call->M, call->N, call->K,
+                    (const uint8_t*)A, call->a_offset, lda,
+                    (const uint8_t*)B, call->b_offset, ldb,
+                    (uint8_t*)C, call->c_offset, ldc,
+                    call->c_mult_int);
+
+        break;
+
     default:
         ALOGE("unimplemented\n");
     }
@@ -631,6 +653,45 @@
 
 }
 
+/**
+ * Reference kernel for RsBlas_bgemm, a quantized 8-bit matrix multiply.
+ *
+ * c[i*ldc + j] = saturate_u8((sum_l (a[i*lda + l] - a_offset) * (b[j*ldb + l] - b_offset)
+ *                             + c_offset) * c_mult_int, rounded and shifted right by 23 bits)
+ * for i < m, j < n, l < k.
+ *
+ * The offsets are passed as uint32_t but the math must be signed: done in unsigned
+ * arithmetic, a negative sum wraps to a large positive value and never clamps to 0.
+ */
+void RsdCpuScriptIntrinsicBLAS::kernelBGEMM(size_t m, size_t n, size_t k,
+                                            const uint8_t* a, uint32_t a_offset, size_t lda,
+                                            const uint8_t* b, uint32_t b_offset, size_t ldb,
+                                            uint8_t* c, uint32_t c_offset, size_t ldc,
+                                            uint32_t c_mult_int) {
+    const int c_shift = 23;
+    const uint64_t rounding = ((uint64_t)1) << (c_shift - 1);
+    // Reinterpret the offsets as signed once, outside the loops.
+    const int32_t a_off = (int32_t)a_offset;
+    const int32_t b_off = (int32_t)b_offset;
+    const int32_t c_off = (int32_t)c_offset;
+    for (size_t j = 0; j < n; j++) {
+        for (size_t i = 0; i < m; i++) {
+            int32_t total = 0;
+            for (size_t l = 0; l < k; l++) {
+                const int32_t a_val = (int32_t)a[(i * lda) + l] - a_off;
+                const int32_t b_val = (int32_t)b[(j * ldb) + l] - b_off;
+                total += a_val * b_val;
+            }
+            // A non-positive sum always rounds to <= 0, so it saturates to 0 directly.
+            const int64_t sum = (int64_t)total + c_off;
+            uint64_t output = 0;
+            if (sum > 0) {
+                output = (((uint64_t)sum * c_mult_int) + rounding) >> c_shift;
+            }
+            c[(ldc * i) + j] = (uint8_t)(output > 255 ? 255 : output);
+        }
+    }
+}
 
 RsdCpuScriptIntrinsicBLAS::RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx,
                                                    const Script *s)
diff --git a/rsDefines.h b/rsDefines.h
index 4ccdeb8..8b334c6 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -277,148 +277,151 @@
 
 enum RsBlasFunction {
     RsBlas_nop = 0,
-    RsBlas_sdsdot,
-    RsBlas_dsdot,
-    RsBlas_sdot,
-    RsBlas_ddot,
-    RsBlas_cdotu_sub,
-    RsBlas_cdotc_sub,
-    RsBlas_zdotu_sub,
-    RsBlas_zdotc_sub,
-    RsBlas_snrm2,
-    RsBlas_sasum,
-    RsBlas_dnrm2,
-    RsBlas_dasum,
-    RsBlas_scnrm2,
-    RsBlas_scasum,
-    RsBlas_dznrm2,
-    RsBlas_dzasum,
-    RsBlas_isamax,
-    RsBlas_idamax,
-    RsBlas_icamax,
-    RsBlas_izamax,
-    RsBlas_sswap,
-    RsBlas_scopy,
-    RsBlas_saxpy,
-    RsBlas_dswap,
-    RsBlas_dcopy,
-    RsBlas_daxpy,
-    RsBlas_cswap,
-    RsBlas_ccopy,
-    RsBlas_caxpy,
-    RsBlas_zswap,
-    RsBlas_zcopy,
-    RsBlas_zaxpy,
-    RsBlas_srotg,
-    RsBlas_srotmg,
-    RsBlas_srot,
-    RsBlas_srotm,
-    RsBlas_drotg,
-    RsBlas_drotmg,
-    RsBlas_drot,
-    RsBlas_drotm,
-    RsBlas_sscal,
-    RsBlas_dscal,
-    RsBlas_cscal,
-    RsBlas_zscal,
-    RsBlas_csscal,
-    RsBlas_zdscal,
-    RsBlas_sgemv,
-    RsBlas_sgbmv,
-    RsBlas_strmv,
-    RsBlas_stbmv,
-    RsBlas_stpmv,
-    RsBlas_strsv,
-    RsBlas_stbsv,
-    RsBlas_stpsv,
-    RsBlas_dgemv,
-    RsBlas_dgbmv,
-    RsBlas_dtrmv,
-    RsBlas_dtbmv,
-    RsBlas_dtpmv,
-    RsBlas_dtrsv,
-    RsBlas_dtbsv,
-    RsBlas_dtpsv,
-    RsBlas_cgemv,
-    RsBlas_cgbmv,
-    RsBlas_ctrmv,
-    RsBlas_ctbmv,
-    RsBlas_ctpmv,
-    RsBlas_ctrsv,
-    RsBlas_ctbsv,
-    RsBlas_ctpsv,
-    RsBlas_zgemv,
-    RsBlas_zgbmv,
-    RsBlas_ztrmv,
-    RsBlas_ztbmv,
-    RsBlas_ztpmv,
-    RsBlas_ztrsv,
-    RsBlas_ztbsv,
-    RsBlas_ztpsv,
-    RsBlas_ssymv,
-    RsBlas_ssbmv,
-    RsBlas_sspmv,
-    RsBlas_sger,
-    RsBlas_ssyr,
-    RsBlas_sspr,
-    RsBlas_ssyr2,
-    RsBlas_sspr2,
-    RsBlas_dsymv,
-    RsBlas_dsbmv,
-    RsBlas_dspmv,
-    RsBlas_dger,
-    RsBlas_dsyr,
-    RsBlas_dspr,
-    RsBlas_dsyr2,
-    RsBlas_dspr2,
-    RsBlas_chemv,
-    RsBlas_chbmv,
-    RsBlas_chpmv,
-    RsBlas_cgeru,
-    RsBlas_cgerc,
-    RsBlas_cher,
-    RsBlas_chpr,
-    RsBlas_cher2,
-    RsBlas_chpr2,
-    RsBlas_zhemv,
-    RsBlas_zhbmv,
-    RsBlas_zhpmv,
-    RsBlas_zgeru,
-    RsBlas_zgerc,
-    RsBlas_zher,
-    RsBlas_zhpr,
-    RsBlas_zher2,
-    RsBlas_zhpr2,
-    RsBlas_sgemm,
-    RsBlas_ssymm,
-    RsBlas_ssyrk,
-    RsBlas_ssyr2k,
-    RsBlas_strmm,
-    RsBlas_strsm,
-    RsBlas_dgemm,
-    RsBlas_dsymm,
-    RsBlas_dsyrk,
-    RsBlas_dsyr2k,
-    RsBlas_dtrmm,
-    RsBlas_dtrsm,
-    RsBlas_cgemm,
-    RsBlas_csymm,
-    RsBlas_csyrk,
-    RsBlas_csyr2k,
-    RsBlas_ctrmm,
-    RsBlas_ctrsm,
-    RsBlas_zgemm,
-    RsBlas_zsymm,
-    RsBlas_zsyrk,
-    RsBlas_zsyr2k,
-    RsBlas_ztrmm,
-    RsBlas_ztrsm,
-    RsBlas_chemm,
-    RsBlas_cherk,
-    RsBlas_cher2k,
-    RsBlas_zhemm,
-    RsBlas_zherk,
-    RsBlas_zher2k
+    RsBlas_sdsdot = 1,
+    RsBlas_dsdot = 2,
+    RsBlas_sdot = 3,
+    RsBlas_ddot = 4,
+    RsBlas_cdotu_sub = 5,
+    RsBlas_cdotc_sub = 6,
+    RsBlas_zdotu_sub = 7,
+    RsBlas_zdotc_sub = 8,
+    RsBlas_snrm2 = 9,
+    RsBlas_sasum = 10,
+    RsBlas_dnrm2 = 11,
+    RsBlas_dasum = 12,
+    RsBlas_scnrm2 = 13,
+    RsBlas_scasum = 14,
+    RsBlas_dznrm2 = 15,
+    RsBlas_dzasum = 16,
+    RsBlas_isamax = 17,
+    RsBlas_idamax = 18,
+    RsBlas_icamax = 19,
+    RsBlas_izamax = 20,
+    RsBlas_sswap = 21,
+    RsBlas_scopy = 22,
+    RsBlas_saxpy = 23,
+    RsBlas_dswap = 24,
+    RsBlas_dcopy = 25,
+    RsBlas_daxpy = 26,
+    RsBlas_cswap = 27,
+    RsBlas_ccopy = 28,
+    RsBlas_caxpy = 29,
+    RsBlas_zswap = 30,
+    RsBlas_zcopy = 31,
+    RsBlas_zaxpy = 32,
+    RsBlas_srotg = 33,
+    RsBlas_srotmg = 34,
+    RsBlas_srot = 35,
+    RsBlas_srotm = 36,
+    RsBlas_drotg = 37,
+    RsBlas_drotmg = 38,
+    RsBlas_drot = 39,
+    RsBlas_drotm = 40,
+    RsBlas_sscal = 41,
+    RsBlas_dscal = 42,
+    RsBlas_cscal = 43,
+    RsBlas_zscal = 44,
+    RsBlas_csscal = 45,
+    RsBlas_zdscal = 46,
+    RsBlas_sgemv = 47,
+    RsBlas_sgbmv = 48,
+    RsBlas_strmv = 49,
+    RsBlas_stbmv = 50,
+    RsBlas_stpmv = 51,
+    RsBlas_strsv = 52,
+    RsBlas_stbsv = 53,
+    RsBlas_stpsv = 54,
+    RsBlas_dgemv = 55,
+    RsBlas_dgbmv = 56,
+    RsBlas_dtrmv = 57,
+    RsBlas_dtbmv = 58,
+    RsBlas_dtpmv = 59,
+    RsBlas_dtrsv = 60,
+    RsBlas_dtbsv = 61,
+    RsBlas_dtpsv = 62,
+    RsBlas_cgemv = 63,
+    RsBlas_cgbmv = 64,
+    RsBlas_ctrmv = 65,
+    RsBlas_ctbmv = 66,
+    RsBlas_ctpmv = 67,
+    RsBlas_ctrsv = 68,
+    RsBlas_ctbsv = 69,
+    RsBlas_ctpsv = 70,
+    RsBlas_zgemv = 71,
+    RsBlas_zgbmv = 72,
+    RsBlas_ztrmv = 73,
+    RsBlas_ztbmv = 74,
+    RsBlas_ztpmv = 75,
+    RsBlas_ztrsv = 76,
+    RsBlas_ztbsv = 77,
+    RsBlas_ztpsv = 78,
+    RsBlas_ssymv = 79,
+    RsBlas_ssbmv = 80,
+    RsBlas_sspmv = 81,
+    RsBlas_sger = 82,
+    RsBlas_ssyr = 83,
+    RsBlas_sspr = 84,
+    RsBlas_ssyr2 = 85,
+    RsBlas_sspr2 = 86,
+    RsBlas_dsymv = 87,
+    RsBlas_dsbmv = 88,
+    RsBlas_dspmv = 89,
+    RsBlas_dger = 90,
+    RsBlas_dsyr = 91,
+    RsBlas_dspr = 92,
+    RsBlas_dsyr2 = 93,
+    RsBlas_dspr2 = 94,
+    RsBlas_chemv = 95,
+    RsBlas_chbmv = 96,
+    RsBlas_chpmv = 97,
+    RsBlas_cgeru = 98,
+    RsBlas_cgerc = 99,
+    RsBlas_cher = 100,
+    RsBlas_chpr = 101,
+    RsBlas_cher2 = 102,
+    RsBlas_chpr2 = 103,
+    RsBlas_zhemv = 104,
+    RsBlas_zhbmv = 105,
+    RsBlas_zhpmv = 106,
+    RsBlas_zgeru = 107,
+    RsBlas_zgerc = 108,
+    RsBlas_zher = 109,
+    RsBlas_zhpr = 110,
+    RsBlas_zher2 = 111,
+    RsBlas_zhpr2 = 112,
+    RsBlas_sgemm = 113,
+    RsBlas_ssymm = 114,
+    RsBlas_ssyrk = 115,
+    RsBlas_ssyr2k = 116,
+    RsBlas_strmm = 117,
+    RsBlas_strsm = 118,
+    RsBlas_dgemm = 119,
+    RsBlas_dsymm = 120,
+    RsBlas_dsyrk = 121,
+    RsBlas_dsyr2k = 122,
+    RsBlas_dtrmm = 123,
+    RsBlas_dtrsm = 124,
+    RsBlas_cgemm = 125,
+    RsBlas_csymm = 126,
+    RsBlas_csyrk = 127,
+    RsBlas_csyr2k = 128,
+    RsBlas_ctrmm = 129,
+    RsBlas_ctrsm = 130,
+    RsBlas_zgemm = 131,
+    RsBlas_zsymm = 132,
+    RsBlas_zsyrk = 133,
+    RsBlas_zsyr2k = 134,
+    RsBlas_ztrmm = 135,
+    RsBlas_ztrsm = 136,
+    RsBlas_chemm = 137,
+    RsBlas_cherk = 138,
+    RsBlas_cher2k = 139,
+    RsBlas_zhemm = 140,
+    RsBlas_zherk = 141,
+    RsBlas_zher2k = 142,
+
+    // BLAS extensions start here
+    RsBlas_bgemm = 1000,
 };
 
 // custom complex types because of NDK support
@@ -432,7 +435,7 @@
     double i;
 } RsDoubleComplex;
 
-typedef union { 
+typedef union {
     float f;
     RsFloatComplex c;
     double d;
@@ -455,8 +458,12 @@
     int incY;
     int KL;
     int KU;
+    uint32_t a_offset;
+    uint32_t b_offset;
+    uint32_t c_offset;
+    uint32_t c_mult_int;
 } RsBlasCall;
-          
+
 #ifdef __cplusplus
 };
 #endif
diff --git a/rsInternalDefines.h b/rsInternalDefines.h
index 8a62e40..46b848f 100644
--- a/rsInternalDefines.h
+++ b/rsInternalDefines.h
@@ -190,6 +190,7 @@
     // unused 10, 11
     RS_SCRIPT_INTRINSIC_ID_RESIZE = 12,
     RS_SCRIPT_INTRINSIC_ID_BLAS = 13,
+    RS_SCRIPT_INTRINSIC_ID_EXTBLAS = 14,
     RS_SCRIPT_INTRINSIC_ID_OEM_START = 0x10000000
 };