Add BLAS to supported intrinsics.

Change-Id: I8e776b2ffdbac09a73924035eee2eca0a12facb3
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 6599932..7c63c95 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -28,6 +28,7 @@
 	rsCpuScriptGroup2.cpp \
 	rsCpuIntrinsic.cpp \
 	rsCpuIntrinsic3DLUT.cpp \
+	rsCpuIntrinsicBLAS.cpp \
 	rsCpuIntrinsicBlend.cpp \
 	rsCpuIntrinsicBlur.cpp \
 	rsCpuIntrinsicColorMatrix.cpp \
@@ -82,12 +83,12 @@
 
 LOCAL_SHARED_LIBRARIES += libRS libcutils libutils liblog libsync libc++ libdl
 
-# these are not supported in 64-bit yet
-LOCAL_SHARED_LIBRARIES += libbcc libbcinfo
+LOCAL_SHARED_LIBRARIES += libbcc libbcinfo libblas
 
 
 LOCAL_C_INCLUDES += frameworks/compile/libbcc/include
 LOCAL_C_INCLUDES += frameworks/rs
+LOCAL_C_INCLUDES += external/cblas/include
 
 ifneq ($(HOST_OS),windows)
 include external/libcxx/libcxx.mk
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 0ec7b28..2492c22 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -642,6 +642,8 @@
                                                  const Script *s, const Element *e);
 extern RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx,
                                               const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
+                                              const Script *s, const Element *e);
 
 RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
                                     RsScriptIntrinsicID iid, Element *e) {
@@ -678,6 +680,11 @@
     case RS_SCRIPT_INTRINSIC_ID_RESIZE:
         i = rsdIntrinsic_Resize(this, s, e);
         break;
+#if !defined(RS_COMPATIBILITY_LIB)
+    case RS_SCRIPT_INTRINSIC_ID_BLAS:
+        i = rsdIntrinsic_BLAS(this, s, e);
+        break;
+#endif
 
     default:
         rsAssert(0);
diff --git a/cpu_ref/rsCpuIntrinsicBLAS.cpp b/cpu_ref/rsCpuIntrinsicBLAS.cpp
new file mode 100644
index 0000000..486eed8
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicBLAS.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+#include "cblas.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU reference implementation of the BLAS intrinsic. The class declares no
+// data members of its own; invokeForEach() forwards the request described by
+// its usr argument to the matching cblas_* routine.
+class RsdCpuScriptIntrinsicBLAS : public RsdCpuScriptIntrinsic {
+public:
+    RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx, const Script *s);
+    virtual ~RsdCpuScriptIntrinsicBLAS();
+
+    // Sets the script's exported-variable count to zero.
+    virtual void populateScript(Script *);
+
+    // Executes one BLAS call: usr is read as an RsBlasCall and ain supplies
+    // the operand Allocations.
+    virtual void invokeForEach(uint32_t slot, const Allocation ** ain,
+                               uint32_t inLen, Allocation * aout,
+                               const void * usr, uint32_t usrLen,
+                               const RsScriptCall *sc);
+};
+
+}
+}
+
+void RsdCpuScriptIntrinsicBLAS::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 0;  // this intrinsic reports no exported variables
+}
+
+// Maps up to three input Allocations onto BLAS operand slots. For every
+// non-null ain[i] (i = 0, 1, 2) the i-th output pointer (A, B, C) receives
+// that Allocation's lod[0].mallocPtr and the i-th leading dimension
+// (lda, ldb, ldc) receives lod[0].stride divided by size, where size is the
+// element size in bytes. Outputs whose ain entry is null keep their previous
+// value. Callers pass nullptr for outputs they do not use, so each output is
+// checked before it is written instead of being dereferenced blindly.
+static void initABC(const Allocation ** ain,
+                    size_t size,
+                    void** A,
+                    void** B,
+                    void** C,
+                    int* lda,
+                    int* ldb,
+                    int* ldc)
+{
+    void** const ptrs[3] = { A, B, C };
+    int* const lds[3] = { lda, ldb, ldc };
+    for (int i = 0; i < 3; i++) {
+        if (!ain[i]) continue;
+        if (ptrs[i]) *ptrs[i] = ain[i]->mHal.drvState.lod[0].mallocPtr;
+        if (lds[i]) *lds[i] = (int)(ain[i]->mHal.drvState.lod[0].stride/size);
+    }
+}
+
+// Runs the single BLAS routine selected by the RsBlasCall passed in usr.
+// The call's enum fields are cast to their CBLAS equivalents, initABC() maps
+// ain[0..2] onto that routine's matrix/vector operands (data pointer plus
+// leading dimension), and the matching cblas_* function is invoked in
+// row-major order. Function ids without a case here only log "unimplemented".
+// slot, inLen, aout, usrLen and sc are not used.
+void RsdCpuScriptIntrinsicBLAS::invokeForEach(uint32_t slot,
+                                              const Allocation ** ain,
+                                              uint32_t inLen,
+                                              Allocation * aout,
+                                              const void * usr,
+                                              uint32_t usrLen,
+                                              const RsScriptCall *sc) {
+    RsBlasCall* call = (RsBlasCall*) usr;
+    // setup BLAS enum args
+    enum CBLAS_TRANSPOSE TransA = (enum CBLAS_TRANSPOSE)call->transA;
+    enum CBLAS_TRANSPOSE TransB = (enum CBLAS_TRANSPOSE)call->transB;
+    enum CBLAS_UPLO Uplo = (enum CBLAS_UPLO)call->uplo;
+    enum CBLAS_DIAG Diag = (enum CBLAS_DIAG)call->diag;
+    enum CBLAS_SIDE Side = (enum CBLAS_SIDE)call->side;
+
+    void *A = nullptr;
+    void *B = nullptr;
+    void *C = nullptr;
+    void *X = nullptr;
+    void *Y = nullptr;
+
+    int lda = 0, ldb = 0, ldc = 0;
+
+    switch (call->func) {
+    // Level 1 BLAS (returns into a 1D Allocation) is not handled here; those
+    // function ids fall through to the default case below.
+
+    // Level 2 BLAS. For gemv/gbmv the third input, ain[2], supplies Y.
+    case (RsBlas_sgemv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_sgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.f, (float*)A,
+                    lda, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
+        break;
+    case (RsBlas_sgbmv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_sgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
+                    call->alpha.f, (float*)A, lda, (float*)X, call->incX,
+                    call->beta.f, (float*)Y, call->incY);
+        break;
+    case (RsBlas_strmv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_strmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
+                    lda, (float*)X, call->incX);
+        break;
+    case (RsBlas_stbmv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_stbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
+                    lda, (float*)X, call->incX);
+        break;
+    // stpmv takes a packed 1D Allocation only
+    case (RsBlas_stpmv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_stpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
+                    (float*)X, call->incX);
+        break;
+    case (RsBlas_strsv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_strsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A, lda,
+                    (float*)X, call->incX);
+        break;
+    case (RsBlas_stbsv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_stbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
+                    lda, (float*)X, call->incX);
+        break;
+    case (RsBlas_stpsv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_stpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
+                    (float*)X, call->incX);
+        break;
+    case (RsBlas_dgemv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.d, (double*)A,
+                    lda, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
+        break;
+    case (RsBlas_dgbmv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
+                    call->alpha.d, (double*)A, lda, (double*)X, call->incX,
+                    call->beta.d, (double*)Y, call->incY);
+        break;
+    case (RsBlas_dtrmv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
+                    lda, (double*)X, call->incX);
+        break;
+    case (RsBlas_dtbmv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
+                    lda, (double*)X, call->incX);
+        break;
+    // dtpmv takes a packed 1D Allocation only
+    case (RsBlas_dtpmv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
+                    (double*)X, call->incX);
+        break;
+    case (RsBlas_dtrsv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A, lda,
+                    (double*)X, call->incX);
+        break;
+    case (RsBlas_dtbsv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
+                    lda, (double*)X, call->incX);
+        break;
+    case (RsBlas_dtpsv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
+                    (double*)X, call->incX);
+        break;
+    case (RsBlas_cgemv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_cgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.c, (void*)A,
+                    lda, (void*)X, call->incX, (void*)&call->beta.c, (void*)Y, call->incY);
+        break;
+    case (RsBlas_cgbmv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_cgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
+                    (void*)&call->alpha.c, (void*)A, lda, (void*)X, call->incX,
+                    (void*)&call->beta.c, (void*)Y, call->incY);
+        break;
+    case (RsBlas_ctrmv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    case (RsBlas_ctbmv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    // ctpmv takes a packed 1D Allocation only
+    case (RsBlas_ctpmv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_ctrsv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_ctbsv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    case (RsBlas_ctpsv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_zgemv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.z, (void*)A,
+                    lda, (void*)X, call->incX, (void*)&call->beta.z, (void*)Y, call->incY);
+        break;
+    case (RsBlas_zgbmv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
+                    (void*)&call->alpha.z, (void*)A, lda, (void*)X, call->incX,
+                    (void*)&call->beta.z, (void*)Y, call->incY);
+        break;
+    case (RsBlas_ztrmv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    case (RsBlas_ztbmv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    // ztpmv takes a packed 1D Allocation only
+    case (RsBlas_ztpmv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_ztrsv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_ztbsv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    case (RsBlas_ztpsv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    (void*)X, call->incX);
+        break;
+
+    // S and D only
+    case (RsBlas_ssymv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_ssymv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A, lda,
+                    (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
+        break;
+    case (RsBlas_ssbmv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_ssbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.f,
+                    (float*)A, lda, (float*)X, call->incX, call->beta.f,
+                    (float*)Y, call->incY);
+        break;
+    //sspmv requires a packed 1D Allocation
+    case (RsBlas_sspmv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_sspmv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A,
+                    (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
+        break;
+    // following calls have init reordered because A is output matrix
+    case (RsBlas_sger):
+        initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_sger(CblasRowMajor, call->M, call->N, call->alpha.f, (float*)X,
+                   call->incX, (float*)Y, call->incY, (float*)A, lda);
+        break;
+    case (RsBlas_ssyr):
+        initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_ssyr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
+                   (float*)A, lda);
+        break;
+    // sspr is packed 1D Allocation A only
+    case (RsBlas_sspr):
+        initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_sspr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
+                   (float*)A);
+        break;
+    case (RsBlas_ssyr2):
+        initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_ssyr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
+                    (float*)Y, call->incY, (float*)A, lda);
+        break;
+    // sspr2 is packed 1D Allocation A only
+    case (RsBlas_sspr2):
+        initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_sspr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
+                    (float*)Y, call->incY, (float*)A);
+        break;
+    case (RsBlas_dsymv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dsymv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A, lda,
+                    (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
+        break;
+    case (RsBlas_dsbmv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dsbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.d,
+                    (double*)A, lda, (double*)X, call->incX, call->beta.d,
+                    (double*)Y, call->incY);
+        break;
+    // dspmv requires a packed 1D Allocation
+    case (RsBlas_dspmv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dspmv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A,
+                    (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
+        break;
+    // following calls have init reordered because A is output matrix
+    case (RsBlas_dger):
+        initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_dger(CblasRowMajor, call->M, call->N, call->alpha.d, (double*)X,
+                   call->incX, (double*)Y, call->incY, (double*)A, lda);
+        break;
+    case (RsBlas_dsyr):
+        initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_dsyr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
+                   (double*)A, lda);
+        break;
+    // dspr is packed 1D Allocation A only
+    case (RsBlas_dspr):
+        initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_dspr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
+                   (double*)A);
+        break;
+    case (RsBlas_dsyr2):
+        initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_dsyr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
+                    (double*)Y, call->incY, (double*)A, lda);
+        break;
+    // dspr2 is packed 1D Allocation A only
+    case (RsBlas_dspr2):
+        initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_dspr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
+                    (double*)Y, call->incY, (double*)A);
+        break;
+
+    // C and Z only
+    case (RsBlas_chemv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_chemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A, lda,
+                    X, call->incX, (void*)&call->beta.c, Y, call->incY);
+        break;
+    case (RsBlas_chbmv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_chbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.c,
+                    A, lda, X, call->incX, (void*)&call->beta.c, Y, call->incY);
+        break;
+    case (RsBlas_chpmv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_chpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A,
+                    X, call->incX, (void*)&call->beta.c, Y, call->incY);
+        break;
+    case (RsBlas_cgeru):
+        initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_cgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
+                    X, call->incX, Y, call->incY, A, lda);
+        break;
+    case (RsBlas_cgerc):
+        initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_cgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
+                    X, call->incX, Y, call->incY, A, lda);
+        break;
+    case (RsBlas_cher):
+        initABC(ain, sizeof(float)*2, &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_cher(CblasRowMajor, Uplo, call->N, call->alpha.f,
+                   X, call->incX, A, lda);
+        break;
+    // packed 1D Allocations only
+    case (RsBlas_chpr):
+        initABC(ain, sizeof(float)*2, &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_chpr(CblasRowMajor, Uplo, call->N, call->alpha.f, X,
+                   call->incX, A);
+        break;
+    case (RsBlas_cher2):
+        initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_cher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c,
+                   X, call->incX, Y, call->incY, A, lda);
+        break;
+    // packed 1D Allocations only
+    case (RsBlas_chpr2):
+        initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_chpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, X,
+                   call->incX, Y, call->incY, A);
+        break;
+    case (RsBlas_zhemv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zhemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A, lda,
+                    X, call->incX, (void*)&call->beta.z, Y, call->incY);
+        break;
+    case (RsBlas_zhbmv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zhbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.z,
+                    A, lda, X, call->incX, (void*)&call->beta.z, Y, call->incY);
+        break;
+    case (RsBlas_zhpmv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zhpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A,
+                    X, call->incX, (void*)&call->beta.z, Y, call->incY);
+        break;
+    case (RsBlas_zgeru):
+        initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_zgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
+                    X, call->incX, Y, call->incY, A, lda);
+        break;
+    case (RsBlas_zgerc):
+        initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_zgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
+                    X, call->incX, Y, call->incY, A, lda);
+        break;
+    case (RsBlas_zher):
+        initABC(ain, sizeof(double)*2, &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_zher(CblasRowMajor, Uplo, call->N, call->alpha.d,
+                   X, call->incX, A, lda);
+        break;
+    // packed 1D Allocations only
+    case (RsBlas_zhpr):
+        initABC(ain, sizeof(double)*2, &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_zhpr(CblasRowMajor, Uplo, call->N, call->alpha.d, X,
+                   call->incX, A);
+        break;
+    case (RsBlas_zher2):
+        initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_zher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z,
+                   X, call->incX, Y, call->incY, A, lda);
+        break;
+    // packed 1D Allocations only
+    case (RsBlas_zhpr2):
+        initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_zhpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, X,
+                   call->incX, Y, call->incY, A);
+        break;
+
+    // Level 3 BLAS
+    case (RsBlas_sgemm):
+        initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_sgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.f,
+                    (float*)A, lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
+        break;
+    case (RsBlas_ssymm):
+        initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_ssymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.f, (float*)A,
+                    lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
+        break;
+    case (RsBlas_ssyrk):
+        initABC(ain, sizeof(float), &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_ssyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
+                    lda, call->beta.f, (float*)C, ldc);
+        break;
+    case (RsBlas_ssyr2k):
+        initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_ssyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
+                     lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
+        break;
+    case (RsBlas_strmm):
+        initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_strmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
+                    (float*)A, lda, (float*)B, ldb);
+        break;
+    case (RsBlas_strsm):
+        initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_strsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
+                    (float*)A, lda, (float*)B, ldb);
+        break;
+
+    case (RsBlas_dgemm):
+        initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_dgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.d,
+                    (double*)A, lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
+        break;
+    case (RsBlas_dsymm):
+        initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_dsymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.d, (double*)A,
+                    lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
+        break;
+    case (RsBlas_dsyrk):
+        initABC(ain, sizeof(double), &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_dsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
+                    lda, call->beta.d, (double*)C, ldc);
+        break;
+    case (RsBlas_dsyr2k):
+        initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_dsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
+                     lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
+        break;
+    case (RsBlas_dtrmm):
+        initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_dtrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
+                    (double*)A, lda, (double*)B, ldb);
+        break;
+    case (RsBlas_dtrsm):
+        initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_dtrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
+                    (double*)A, lda, (double*)B, ldb);
+        break;
+
+    case (RsBlas_cgemm):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_cgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.c,
+                    A, lda, B, ldb, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_csymm):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_csymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A,
+                    lda, B, ldb, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_csyrk):
+        initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_csyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
+                    lda, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_csyr2k):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_csyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
+                     lda, B, ldb, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_ctrmm):
+        initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_ctrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
+                    A, lda, B, ldb);
+        break;
+    case (RsBlas_ctrsm):
+        initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_ctrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
+                    A, lda, B, ldb);
+        break;
+
+    case (RsBlas_zgemm):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.z,
+                    A, lda, B, ldb, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_zsymm):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zsymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A,
+                    lda, B, ldb, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_zsyrk):
+        initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_zsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
+                    lda, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_zsyr2k):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
+                     lda, B, ldb, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_ztrmm):
+        initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_ztrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
+                    A, lda, B, ldb);
+        break;
+    case (RsBlas_ztrsm):
+        initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_ztrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
+                    A, lda, B, ldb);
+        break;
+
+    // Level 3 C and Z only
+    case (RsBlas_chemm):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_chemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A, lda,
+                    B, ldb, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_cherk):
+        initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_cherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, A, lda,
+                    call->beta.f, C, ldc);
+        break;
+    case (RsBlas_cher2k):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_cher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A, lda,
+                     B, ldb, call->beta.f, C, ldc);
+        break;
+
+    case (RsBlas_zhemm):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zhemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A, lda,
+                    B, ldb, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_zherk):
+        initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_zherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, A, lda,
+                    call->beta.d, C, ldc);
+        break;
+    case (RsBlas_zher2k):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A, lda,
+                     B, ldb, call->beta.d, C, ldc);
+        break;
+
+    default:
+        ALOGE("unimplemented\n");
+    }
+}
+
+
+// Forwards ctx and s to the RsdCpuScriptIntrinsic base, passing a null third
+// argument and tagging the script as RS_SCRIPT_INTRINSIC_ID_BLAS.
+RsdCpuScriptIntrinsicBLAS::RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx,
+                                                     const Script *s)
+    : RsdCpuScriptIntrinsic(ctx, s, nullptr, RS_SCRIPT_INTRINSIC_ID_BLAS) {
+}
+
+// Nothing to release here: the class declares no data members of its own.
+RsdCpuScriptIntrinsicBLAS::~RsdCpuScriptIntrinsicBLAS() = default;
+
+
+
+
+
+// Factory: returns a new heap-allocated BLAS intrinsic; e is not used.
+RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx, const Script *s,
+                                     const Element *e) {
+    return new RsdCpuScriptIntrinsicBLAS(ctx, s);
+}
diff --git a/rsDefines.h b/rsDefines.h
index 69a62d6..4ccdeb8 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -254,7 +254,209 @@
     RS_CONTEXT_LOW_POWER        = 0x0004
 };
 
+enum RsBlasTranspose {  // Type of RsBlasCall::transA/transB; values match CBLAS_TRANSPOSE in cblas.h.
+    RsBlasNoTrans=111,  // Same value as CblasNoTrans.
+    RsBlasTrans=112,  // Same value as CblasTrans.
+    RsBlasConjTrans=113  // Same value as CblasConjTrans.
+};
 
+enum RsBlasUplo {  // Type of RsBlasCall::uplo; values match CBLAS_UPLO in cblas.h.
+    RsBlasUpper=121,  // Same value as CblasUpper.
+    RsBlasLower=122  // Same value as CblasLower.
+};
+
+enum RsBlasDiag {  // Type of RsBlasCall::diag; values match CBLAS_DIAG in cblas.h.
+    RsBlasNonUnit=131,  // Same value as CblasNonUnit.
+    RsBlasUnit=132  // Same value as CblasUnit.
+};
+
+enum RsBlasSide {  // Type of RsBlasCall::side; values match CBLAS_SIDE in cblas.h.
+    RsBlasLeft=141,  // Same value as CblasLeft.
+    RsBlasRight=142  // Same value as CblasRight.
+};
+
+enum RsBlasFunction {  // Type of RsBlasCall::func: identifies one BLAS routine per enumerator.
+    RsBlas_nop = 0,  // Only explicit value; every later ID is implicit (previous + 1).
+    RsBlas_sdsdot,  // Level 1 BLAS (vector routines) start here.
+    RsBlas_dsdot,
+    RsBlas_sdot,
+    RsBlas_ddot,
+    RsBlas_cdotu_sub,
+    RsBlas_cdotc_sub,
+    RsBlas_zdotu_sub,
+    RsBlas_zdotc_sub,
+    RsBlas_snrm2,
+    RsBlas_sasum,
+    RsBlas_dnrm2,
+    RsBlas_dasum,
+    RsBlas_scnrm2,
+    RsBlas_scasum,
+    RsBlas_dznrm2,
+    RsBlas_dzasum,
+    RsBlas_isamax,
+    RsBlas_idamax,
+    RsBlas_icamax,
+    RsBlas_izamax,
+    RsBlas_sswap,
+    RsBlas_scopy,
+    RsBlas_saxpy,
+    RsBlas_dswap,
+    RsBlas_dcopy,
+    RsBlas_daxpy,
+    RsBlas_cswap,
+    RsBlas_ccopy,
+    RsBlas_caxpy,
+    RsBlas_zswap,
+    RsBlas_zcopy,
+    RsBlas_zaxpy,
+    RsBlas_srotg,
+    RsBlas_srotmg,
+    RsBlas_srot,
+    RsBlas_srotm,
+    RsBlas_drotg,
+    RsBlas_drotmg,
+    RsBlas_drot,
+    RsBlas_drotm,
+    RsBlas_sscal,
+    RsBlas_dscal,
+    RsBlas_cscal,
+    RsBlas_zscal,
+    RsBlas_csscal,
+    RsBlas_zdscal,
+    RsBlas_sgemv,  // Level 2 BLAS (matrix-vector routines) start here.
+    RsBlas_sgbmv,
+    RsBlas_strmv,
+    RsBlas_stbmv,
+    RsBlas_stpmv,
+    RsBlas_strsv,
+    RsBlas_stbsv,
+    RsBlas_stpsv,
+    RsBlas_dgemv,
+    RsBlas_dgbmv,
+    RsBlas_dtrmv,
+    RsBlas_dtbmv,
+    RsBlas_dtpmv,
+    RsBlas_dtrsv,
+    RsBlas_dtbsv,
+    RsBlas_dtpsv,
+    RsBlas_cgemv,
+    RsBlas_cgbmv,
+    RsBlas_ctrmv,
+    RsBlas_ctbmv,
+    RsBlas_ctpmv,
+    RsBlas_ctrsv,
+    RsBlas_ctbsv,
+    RsBlas_ctpsv,
+    RsBlas_zgemv,
+    RsBlas_zgbmv,
+    RsBlas_ztrmv,
+    RsBlas_ztbmv,
+    RsBlas_ztpmv,
+    RsBlas_ztrsv,
+    RsBlas_ztbsv,
+    RsBlas_ztpsv,
+    RsBlas_ssymv,
+    RsBlas_ssbmv,
+    RsBlas_sspmv,
+    RsBlas_sger,
+    RsBlas_ssyr,
+    RsBlas_sspr,
+    RsBlas_ssyr2,
+    RsBlas_sspr2,
+    RsBlas_dsymv,
+    RsBlas_dsbmv,
+    RsBlas_dspmv,
+    RsBlas_dger,
+    RsBlas_dsyr,
+    RsBlas_dspr,
+    RsBlas_dsyr2,
+    RsBlas_dspr2,
+    RsBlas_chemv,
+    RsBlas_chbmv,
+    RsBlas_chpmv,
+    RsBlas_cgeru,
+    RsBlas_cgerc,
+    RsBlas_cher,
+    RsBlas_chpr,
+    RsBlas_cher2,
+    RsBlas_chpr2,
+    RsBlas_zhemv,
+    RsBlas_zhbmv,
+    RsBlas_zhpmv,
+    RsBlas_zgeru,
+    RsBlas_zgerc,
+    RsBlas_zher,
+    RsBlas_zhpr,
+    RsBlas_zher2,
+    RsBlas_zhpr2,
+    RsBlas_sgemm,  // Level 3 BLAS (matrix-matrix routines) start here.
+    RsBlas_ssymm,
+    RsBlas_ssyrk,
+    RsBlas_ssyr2k,
+    RsBlas_strmm,
+    RsBlas_strsm,
+    RsBlas_dgemm,
+    RsBlas_dsymm,
+    RsBlas_dsyrk,
+    RsBlas_dsyr2k,
+    RsBlas_dtrmm,
+    RsBlas_dtrsm,
+    RsBlas_cgemm,
+    RsBlas_csymm,
+    RsBlas_csyrk,
+    RsBlas_csyr2k,
+    RsBlas_ctrmm,
+    RsBlas_ctrsm,
+    RsBlas_zgemm,
+    RsBlas_zsymm,
+    RsBlas_zsyrk,
+    RsBlas_zsyr2k,
+    RsBlas_ztrmm,
+    RsBlas_ztrsm,
+    RsBlas_chemm,
+    RsBlas_cherk,
+    RsBlas_cher2k,
+    RsBlas_zhemm,
+    RsBlas_zherk,
+    RsBlas_zher2k  // Last ID. Inserting or reordering entries above changes every later value.
+};
+
+// Custom complex types because of NDK support.
+typedef struct {
+    float r;  // Real part.
+    float i;  // Imaginary part.
+} RsFloatComplex;  // Single-precision complex value; the type of RsBlasScalar::c.
+
+typedef struct {
+    double r;  // Real part.
+    double i;  // Imaginary part.
+} RsDoubleComplex;  // Double-precision complex value; the type of RsBlasScalar::z.
+
+typedef union {  // One scalar argument (alpha or beta); the member read depends on the routine.
+    float f;  // Passed by value for float scalars, e.g. alpha/beta of cblas_cherk.
+    RsFloatComplex c;  // Passed by address (as void*) for float-complex scalars, e.g. alpha of cblas_cher2k.
+    double d;  // Passed by value for double scalars, e.g. alpha/beta of cblas_zherk.
+    RsDoubleComplex z;  // Passed by address (as void*) for double-complex scalars, e.g. alpha/beta of cblas_zhemm.
+} RsBlasScalar;
+
+typedef struct {  // Describes one BLAS invocation: routine ID, option flags, sizes and scalars.
+    RsBlasFunction func;  // Routine to run.
+    RsBlasTranspose transA;  // Presumably the transpose option for matrix A -- confirm against the dispatcher.
+    RsBlasTranspose transB;  // Presumably the transpose option for matrix B -- confirm against the dispatcher.
+    RsBlasUplo uplo;  // Upper/lower triangle selector.
+    RsBlasDiag diag;  // Unit/non-unit diagonal selector.
+    RsBlasSide side;  // Left/right side selector.
+    int M;  // Passed as a dimension argument to cblas_* (e.g. cblas_zhemm).
+    int N;  // Passed as a dimension argument to cblas_* (e.g. cblas_cherk).
+    int K;  // Passed as a dimension argument to cblas_* (e.g. cblas_cherk).
+    RsBlasScalar alpha;  // Alpha scalar; the union member used depends on func.
+    RsBlasScalar beta;  // Beta scalar; the union member used depends on func.
+    int incX;  // Presumably the stride of vector X (BLAS incX) -- TODO confirm.
+    int incY;  // Presumably the stride of vector Y (BLAS incY) -- TODO confirm.
+    int KL;  // Presumably the sub-diagonal count for banded routines -- TODO confirm.
+    int KU;  // Presumably the super-diagonal count for banded routines -- TODO confirm.
+} RsBlasCall;
+          
 #ifdef __cplusplus
 };
 #endif
diff --git a/rsInternalDefines.h b/rsInternalDefines.h
index 2a3f3fd..8a62e40 100644
--- a/rsInternalDefines.h
+++ b/rsInternalDefines.h
@@ -189,7 +189,7 @@
     RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9,
     // unused 10, 11
     RS_SCRIPT_INTRINSIC_ID_RESIZE = 12,
-
+    RS_SCRIPT_INTRINSIC_ID_BLAS = 13,
     RS_SCRIPT_INTRINSIC_ID_OEM_START = 0x10000000
 };