blob: 2a5ab21acc94db40186e04e5a38a86dc4961dd2a [file] [log] [blame]
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
Miao Wange941f182015-07-14 16:18:49 -070020#include "rsCpuBLASDispatch.h"
Miao Wang99d0e812015-07-06 18:05:39 -070021#include "eight_bit_int_gemm.h"
Tim Murray64c682b2015-01-09 12:08:43 -080022
Tim Murray64c682b2015-01-09 12:08:43 -080023namespace android {
24namespace renderscript {
25
26
27class RsdCpuScriptIntrinsicBLAS : public RsdCpuScriptIntrinsic {
28public:
Stephen Hinesc060f142015-05-13 19:26:09 -070029 void invokeForEach(uint32_t slot,
30 const Allocation ** ain,
31 uint32_t inLen,
32 Allocation * aout,
33 const void * usr,
34 uint32_t usrLen,
35 const RsScriptCall *sc) override;
Tim Murray64c682b2015-01-09 12:08:43 -080036
Stephen Hinesc060f142015-05-13 19:26:09 -070037 void populateScript(Script *) override;
38 ~RsdCpuScriptIntrinsicBLAS() override;
Tim Murray64c682b2015-01-09 12:08:43 -080039 RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx, const Script *s);
40
41protected:
42
Tim Murrayaff74452015-03-30 15:14:36 -070043 uint8_t a_offset = 0;
44 uint8_t b_offset = 0;
45 uint8_t c_offset = 0;
46
Miao Wange941f182015-07-14 16:18:49 -070047#ifdef RS_COMPATIBILITY_LIB
48 bool isBlasLibInitialized = false;
49#endif
Tim Murray2b999882015-04-13 11:42:54 -070050 static void kernelBNNM(size_t m, size_t n, size_t k,
Miao Wang06deda32015-06-29 17:29:43 -070051 const uint8_t* a, uint8_t a_offset, size_t lda,
52 const uint8_t* b, uint8_t b_offset, size_t ldb,
53 uint8_t* c, int32_t c_offset, size_t ldc,
54 int32_t c_mult_int);
Tim Murrayaff74452015-03-30 15:14:36 -070055
56
Tim Murray64c682b2015-01-09 12:08:43 -080057
58};
59
Tim Murray64c682b2015-01-09 12:08:43 -080060void RsdCpuScriptIntrinsicBLAS::populateScript(Script *s) {
61 s->mHal.info.exportedVariableCount = 0;
62}
63
64static void initABC(const Allocation ** ain,
65 size_t size,
66 void** A,
67 void** B,
68 void** C,
69 int* lda,
70 int* ldb,
71 int* ldc)
72{
73 if (ain[0]) {
74 *A = ain[0]->mHal.drvState.lod[0].mallocPtr;
75 *lda = (int)(ain[0]->mHal.drvState.lod[0].stride/size);
76 }
77 if (ain[1]) {
78 *B = ain[1]->mHal.drvState.lod[0].mallocPtr;
79 *ldb = (int)(ain[1]->mHal.drvState.lod[0].stride/size);
80 }
81 if (ain[2]) {
82 *C = ain[2]->mHal.drvState.lod[0].mallocPtr;
83 *ldc = (int)(ain[2]->mHal.drvState.lod[0].stride/size);
84 }
85
86
87}
88
89void RsdCpuScriptIntrinsicBLAS::invokeForEach(uint32_t slot,
90 const Allocation ** ain,
91 uint32_t inLen,
92 Allocation * aout,
93 const void * usr,
94 uint32_t usrLen,
95 const RsScriptCall *sc) {
96 RsBlasCall* call = (RsBlasCall*) usr;
97 // setup BLAS enum args
98 enum CBLAS_TRANSPOSE TransA = (enum CBLAS_TRANSPOSE)call->transA;
99 enum CBLAS_TRANSPOSE TransB = (enum CBLAS_TRANSPOSE)call->transB;
100 enum CBLAS_UPLO Uplo = (enum CBLAS_UPLO)call->uplo;
101 enum CBLAS_DIAG Diag = (enum CBLAS_DIAG)call->diag;
102 enum CBLAS_SIDE Side = (enum CBLAS_SIDE)call->side;
103
104 void *A = nullptr;
105 void *B = nullptr;
106 void *C = nullptr;
107 void *X = nullptr;
108 void *Y = nullptr;
109
110 int lda = 0, ldb = 0, ldc = 0;
111
Miao Wange941f182015-07-14 16:18:49 -0700112#ifdef RS_COMPATIBILITY_LIB
113 // Allow BNNM even without libblas
114 if (call->func != RsBlas_bnnm && !isBlasLibInitialized) {
115 if (!loadBLASLib()) {
116 ALOGE("Failed to load the BLAS lib, IntrinsicBLAS NOT supported!\n");
117 return;
118 }
119 isBlasLibInitialized = true;
120 }
121#endif
122
Tim Murray64c682b2015-01-09 12:08:43 -0800123 switch (call->func) {
124
125 // Level 1 BLAS: returns into a 1D Allocation
126
127
128 // Level 2 BLAS
129 case (RsBlas_sgemv):
Miao Wangb75ba0f2015-04-26 15:57:22 -0700130 initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
Tim Murray64c682b2015-01-09 12:08:43 -0800131 cblas_sgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.f, (float*)A,
132 lda, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
133 break;
134 case (RsBlas_sgbmv):
Miao Wangb75ba0f2015-04-26 15:57:22 -0700135 initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
Tim Murray64c682b2015-01-09 12:08:43 -0800136 cblas_sgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
137 call->alpha.f, (float*)A, lda, (float*)X, call->incX,
138 call->beta.f, (float*)Y, call->incY);
139 break;
140 case (RsBlas_strmv):
141 initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
142 cblas_strmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
143 lda, (float*)X, call->incX);
144 break;
145 case (RsBlas_stbmv):
146 initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
147 cblas_stbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
148 lda, (float*)X, call->incX);
149 break;
150 // stpmv takes a packed 1D Allocation only
151 case (RsBlas_stpmv):
152 initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
153 cblas_stpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
154 (float*)X, call->incX);
155 break;
156 case (RsBlas_strsv):
157 initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
158 cblas_strsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A, lda,
159 (float*)X, call->incX);
160 break;
161 case (RsBlas_stbsv):
162 initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
163 cblas_stbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
164 lda, (float*)X, call->incX);
165 break;
166 case (RsBlas_stpsv):
167 initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
168 cblas_stpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
169 (float*)X, call->incX);
170 break;
171 case (RsBlas_dgemv):
Miao Wangb75ba0f2015-04-26 15:57:22 -0700172 initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
Tim Murray64c682b2015-01-09 12:08:43 -0800173 cblas_dgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.d, (double*)A,
174 lda, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
175 break;
176 case (RsBlas_dgbmv):
Miao Wangb75ba0f2015-04-26 15:57:22 -0700177 initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
Tim Murray64c682b2015-01-09 12:08:43 -0800178 cblas_dgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
179 call->alpha.d, (double*)A, lda, (double*)X, call->incX,
180 call->beta.d, (double*)Y, call->incY);
181 break;
182 case (RsBlas_dtrmv):
183 initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
184 cblas_dtrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
185 lda, (double*)X, call->incX);
186 break;
187 case (RsBlas_dtbmv):
188 initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
189 cblas_dtbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
190 lda, (double*)X, call->incX);
191 break;
192 // stpmv takes a packed 1D Allocation only
193 case (RsBlas_dtpmv):
194 initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
195 cblas_dtpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
196 (double*)X, call->incX);
197 break;
198 case (RsBlas_dtrsv):
199 initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
200 cblas_dtrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A, lda,
201 (double*)X, call->incX);
202 break;
203 case (RsBlas_dtbsv):
204 initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
205 cblas_dtbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
206 lda, (double*)X, call->incX);
207 break;
208 case (RsBlas_dtpsv):
209 initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
210 cblas_dtpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
211 (double*)X, call->incX);
212 break;
213 case (RsBlas_cgemv):
Miao Wangb75ba0f2015-04-26 15:57:22 -0700214 initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
Tim Murray64c682b2015-01-09 12:08:43 -0800215 cblas_cgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.c, (void*)A,
216 lda, (void*)X, call->incX, (void*)&call->beta.c, (void*)Y, call->incY);
217 break;
218 case (RsBlas_cgbmv):
Miao Wangb75ba0f2015-04-26 15:57:22 -0700219 initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
Tim Murray64c682b2015-01-09 12:08:43 -0800220 cblas_cgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
221 (void*)&call->alpha.c, (void*)A, lda, (void*)X, call->incX,
222 (void*)&call->beta.c, (void*)Y, call->incY);
223 break;
224 case (RsBlas_ctrmv):
225 initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
226 cblas_ctrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
227 lda, (void*)X, call->incX);
228 break;
229 case (RsBlas_ctbmv):
230 initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
231 cblas_ctbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
232 lda, (void*)X, call->incX);
233 break;
234 // stpmv takes a packed 1D Allocation only
235 case (RsBlas_ctpmv):
236 initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
237 cblas_ctpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
238 (void*)X, call->incX);
239 break;
240 case (RsBlas_ctrsv):
241 initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
242 cblas_ctrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
243 (void*)X, call->incX);
244 break;
245 case (RsBlas_ctbsv):
246 initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
247 cblas_ctbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
248 lda, (void*)X, call->incX);
249 break;
250 case (RsBlas_ctpsv):
251 initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
252 cblas_ctpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
253 (void*)X, call->incX);
254 break;
255 case (RsBlas_zgemv):
Miao Wangb75ba0f2015-04-26 15:57:22 -0700256 initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
Tim Murray64c682b2015-01-09 12:08:43 -0800257 cblas_zgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.z, (void*)A,
258 lda, (void*)X, call->incX, (void*)&call->beta.z, (void*)Y, call->incY);
259 break;
260 case (RsBlas_zgbmv):
Miao Wangb75ba0f2015-04-26 15:57:22 -0700261 initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
Tim Murray64c682b2015-01-09 12:08:43 -0800262 cblas_zgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
263 (void*)&call->alpha.z, (void*)A, lda, (void*)X, call->incX,
264 (void*)&call->beta.z, (void*)Y, call->incY);
265 break;
266 case (RsBlas_ztrmv):
267 initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
268 cblas_ztrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
269 lda, (void*)X, call->incX);
270 break;
271 case (RsBlas_ztbmv):
272 initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
273 cblas_ztbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
274 lda, (void*)X, call->incX);
275 break;
276 // stpmv takes a packed 1D Allocation only
277 case (RsBlas_ztpmv):
278 initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
279 cblas_ztpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
280 (void*)X, call->incX);
281 break;
282 case (RsBlas_ztrsv):
283 initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
284 cblas_ztrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
285 (void*)X, call->incX);
286 break;
287 case (RsBlas_ztbsv):
288 initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
289 cblas_ztbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
290 lda, (void*)X, call->incX);
291 break;
292 case (RsBlas_ztpsv):
293 initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
294 cblas_ztpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
295 (void*)X, call->incX);
296 break;
297
298
299 // S and D only
300 case (RsBlas_ssymv):
301 initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
302 cblas_ssymv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A, lda,
303 (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
304 break;
305 case (RsBlas_ssbmv):
306 initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
307 cblas_ssbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.f,
308 (float*)A, lda, (float*)X, call->incX, call->beta.f,
309 (float*)Y, call->incY);
310 break;
311 //sspmv requires a packed 1D Allocation
312 case (RsBlas_sspmv):
313 initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
314 cblas_sspmv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A,
315 (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
316 break;
317 // following calls have init reordered because A is output matrix
318 case (RsBlas_sger):
319 initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
320 cblas_sger(CblasRowMajor, call->M, call->N, call->alpha.f, (float*)X,
321 call->incX, (float*)Y, call->incY, (float*)A, lda);
322 break;
323 case (RsBlas_ssyr):
324 initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
325 cblas_ssyr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
326 (float*)A, lda);
327 break;
328 // sspr is packed 1D Allocation A only
329 case (RsBlas_sspr):
330 initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
331 cblas_sspr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
332 (float*)A);
333 break;
334 case (RsBlas_ssyr2):
335 initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
336 cblas_ssyr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
337 (float*)Y, call->incY, (float*)A, lda);
338 break;
339 // sspr2 is packed 1D Allocation A only
340 case (RsBlas_sspr2):
341 initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
342 cblas_sspr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
343 (float*)Y, call->incY, (float*)A);
344 break;
345 case (RsBlas_dsymv):
346 initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
347 cblas_dsymv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A, lda,
348 (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
349 break;
350 case (RsBlas_dsbmv):
351 initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
352 cblas_dsbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.d,
353 (double*)A, lda, (double*)X, call->incX, call->beta.d,
354 (double*)Y, call->incY);
355 break;
356 // dspmv requires a packed 1D Allocation
357 case (RsBlas_dspmv):
358 initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
359 cblas_dspmv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A,
360 (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
361 break;
362 // following calls have init reordered because A is output matrix
363 case (RsBlas_dger):
364 initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
365 cblas_dger(CblasRowMajor, call->M, call->N, call->alpha.d, (double*)X,
366 call->incX, (double*)Y, call->incY, (double*)A, lda);
367 break;
368 case (RsBlas_dsyr):
369 initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
370 cblas_dsyr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
371 (double*)A, lda);
372 break;
373 // dspr is packed 1D Allocation A only
374 case (RsBlas_dspr):
375 initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
376 cblas_dspr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
377 (double*)A);
378 break;
379 case (RsBlas_dsyr2):
380 initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
381 cblas_dsyr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
382 (double*)Y, call->incY, (double*)A, lda);
383 break;
384 // dspr2 is packed 1D Allocation A only
385 case (RsBlas_dspr2):
386 initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
387 cblas_dspr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
388 (double*)Y, call->incY, (double*)A);
389 break;
390
391 // C and Z only
392 case (RsBlas_chemv):
393 initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
394 cblas_chemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A, lda,
395 X, call->incX, (void*)&call->beta.c, Y, call->incY);
396 break;
397 case (RsBlas_chbmv):
398 initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
399 cblas_chbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.c,
400 A, lda, X, call->incX, (void*)&call->beta.c, Y, call->incY);
401 break;
402 case (RsBlas_chpmv):
403 initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
404 cblas_chpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A,
405 X, call->incX, (void*)&call->beta.c, Y, call->incY);
406 break;
407 case (RsBlas_cgeru):
408 initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
409 cblas_cgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
410 X, call->incX, Y, call->incY, A, lda);
411 break;
412 case (RsBlas_cgerc):
413 initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
414 cblas_cgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
415 X, call->incX, Y, call->incY, A, lda);
416 break;
417 case (RsBlas_cher):
Miao Wang08ef7b72015-04-29 18:16:51 -0700418 initABC(ain, sizeof(float)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
Tim Murray64c682b2015-01-09 12:08:43 -0800419 cblas_cher(CblasRowMajor, Uplo, call->N, call->alpha.f,
420 X, call->incX, A, lda);
421 break;
422 // packed 1D Allocations only
423 case (RsBlas_chpr):
Miao Wang08ef7b72015-04-29 18:16:51 -0700424 initABC(ain, sizeof(float)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
Tim Murray64c682b2015-01-09 12:08:43 -0800425 cblas_chpr(CblasRowMajor, Uplo, call->N, call->alpha.f, X,
426 call->incX, A);
427 break;
428 case (RsBlas_cher2):
429 initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
430 cblas_cher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c,
431 X, call->incX, Y, call->incY, A, lda);
432 break;
433 // packed 1D Allocations only
434 case (RsBlas_chpr2):
435 initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
436 cblas_chpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, X,
437 call->incX, Y, call->incY, A);
438 break;
439 case (RsBlas_zhemv):
440 initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
441 cblas_zhemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A, lda,
442 X, call->incX, (void*)&call->beta.z, Y, call->incY);
443 break;
444 case (RsBlas_zhbmv):
445 initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
446 cblas_zhbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.z,
447 A, lda, X, call->incX, (void*)&call->beta.z, Y, call->incY);
448 break;
449 case (RsBlas_zhpmv):
450 initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
451 cblas_zhpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A,
452 X, call->incX, (void*)&call->beta.z, Y, call->incY);
453 break;
454 case (RsBlas_zgeru):
455 initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
456 cblas_zgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
457 X, call->incX, Y, call->incY, A, lda);
458 break;
459 case (RsBlas_zgerc):
460 initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
461 cblas_zgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
462 X, call->incX, Y, call->incY, A, lda);
463 break;
464 case (RsBlas_zher):
Miao Wang08ef7b72015-04-29 18:16:51 -0700465 initABC(ain, sizeof(double)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
Tim Murray64c682b2015-01-09 12:08:43 -0800466 cblas_zher(CblasRowMajor, Uplo, call->N, call->alpha.d,
467 X, call->incX, A, lda);
468 break;
469 // packed 1D Allocations only
470 case (RsBlas_zhpr):
Miao Wang08ef7b72015-04-29 18:16:51 -0700471 initABC(ain, sizeof(double)*2, &X, nullptr, &A, &ldb, nullptr, &lda);
Tim Murray64c682b2015-01-09 12:08:43 -0800472 cblas_zhpr(CblasRowMajor, Uplo, call->N, call->alpha.d, X,
473 call->incX, A);
474 break;
475 case (RsBlas_zher2):
476 initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
477 cblas_zher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z,
478 X, call->incX, Y, call->incY, A, lda);
479 break;
480 // packed 1D Allocations only
481 case (RsBlas_zhpr2):
482 initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
483 cblas_zhpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, X,
484 call->incX, Y, call->incY, A);
485 break;
486
487 // Level 3 BLAS
488 case (RsBlas_sgemm):
489 initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
Tim Murray64c682b2015-01-09 12:08:43 -0800490 cblas_sgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.f,
491 (float*)A, lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
492 break;
493 case (RsBlas_ssymm):
494 initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
495 cblas_ssymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.f, (float*)A,
496 lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
497 break;
498 case (RsBlas_ssyrk):
499 initABC(ain, sizeof(float), &A, nullptr, &C, &lda, nullptr, &ldc);
500 cblas_ssyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
501 lda, call->beta.f, (float*)C, ldc);
502 break;
503 case (RsBlas_ssyr2k):
504 initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
505 cblas_ssyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
506 lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
507 break;
508 case (RsBlas_strmm):
509 initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
510 cblas_strmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
511 (float*)A, lda, (float*)B, ldb);
512 break;
513 case (RsBlas_strsm):
514 initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
515 cblas_strsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
516 (float*)A, lda, (float*)B, ldb);
517 break;
518
519
520 case (RsBlas_dgemm):
521 initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
522 cblas_dgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.d,
523 (double*)A, lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
524 break;
525 case (RsBlas_dsymm):
526 initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
527 cblas_dsymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.d, (double*)A,
528 lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
529 break;
530 case (RsBlas_dsyrk):
531 initABC(ain, sizeof(double), &A, nullptr, &C, &lda, nullptr, &ldc);
532 cblas_dsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
533 lda, call->beta.d, (double*)C, ldc);
534 break;
535 case (RsBlas_dsyr2k):
536 initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
537 cblas_dsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
538 lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
539 break;
540 case (RsBlas_dtrmm):
541 initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
542 cblas_dtrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
543 (double*)A, lda, (double*)B, ldb);
544 break;
545 case (RsBlas_dtrsm):
546 initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
547 cblas_dtrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
548 (double*)A, lda, (double*)B, ldb);
549 break;
550
551 case (RsBlas_cgemm):
552 initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
553 cblas_cgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.c,
554 A, lda, B, ldb, (void*)&call->beta.c, C, ldc);
555 break;
556 case (RsBlas_csymm):
557 initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
558 cblas_csymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A,
559 lda, B, ldb, (void*)&call->beta.c, C, ldc);
560 break;
561 case (RsBlas_csyrk):
562 initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
563 cblas_csyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
564 lda, (void*)&call->beta.c, C, ldc);
565 break;
566 case (RsBlas_csyr2k):
567 initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
568 cblas_csyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
569 lda, B, ldb, (void*)&call->beta.c, C, ldc);
570 break;
571 case (RsBlas_ctrmm):
572 initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
573 cblas_ctrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
574 A, lda, B, ldb);
575 break;
576 case (RsBlas_ctrsm):
577 initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
578 cblas_ctrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
579 A, lda, B, ldb);
580 break;
581
582 case (RsBlas_zgemm):
583 initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
584 cblas_zgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.z,
585 A, lda, B, ldb, (void*)&call->beta.z, C, ldc);
586 break;
587 case (RsBlas_zsymm):
588 initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
589 cblas_zsymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A,
590 lda, B, ldb, (void*)&call->beta.z, C, ldc);
591 break;
592 case (RsBlas_zsyrk):
593 initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
594 cblas_zsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
595 lda, (void*)&call->beta.z, C, ldc);
596 break;
597 case (RsBlas_zsyr2k):
598 initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
599 cblas_zsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
600 lda, B, ldb, (void*)&call->beta.z, C, ldc);
601 break;
602 case (RsBlas_ztrmm):
603 initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
604 cblas_ztrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
605 A, lda, B, ldb);
606 break;
607 case (RsBlas_ztrsm):
608 initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
609 cblas_ztrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
610 A, lda, B, ldb);
611 break;
612
613 // Level 3 C and Z only
614 case (RsBlas_chemm):
615 initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
616 cblas_chemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A, lda,
617 B, ldb, (void*)&call->beta.c, C, ldc);
618 break;
619 case (RsBlas_cherk):
620 initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
621 cblas_cherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, A, lda,
622 call->beta.f, C, ldc);
623 break;
624 case (RsBlas_cher2k):
625 initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
626 cblas_cher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A, lda,
627 B, ldb, call->beta.f, C, ldc);
628 break;
629
630 case (RsBlas_zhemm):
631 initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
632 cblas_zhemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A, lda,
633 B, ldb, (void*)&call->beta.z, C, ldc);
634 break;
635 case (RsBlas_zherk):
636 initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
637 cblas_zherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, A, lda,
638 call->beta.d, C, ldc);
639 break;
640 case (RsBlas_zher2k):
641 initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
642 cblas_zher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A, lda,
643 B, ldb, call->beta.d, C, ldc);
644 break;
645
Tim Murrayaff74452015-03-30 15:14:36 -0700646
Tim Murray2b999882015-04-13 11:42:54 -0700647 case (RsBlas_bnnm):
Tim Murrayaff74452015-03-30 15:14:36 -0700648 initABC(ain, sizeof(uint8_t), &A, &B, &C, &lda, &ldb, &ldc);
Tim Murray2b999882015-04-13 11:42:54 -0700649 kernelBNNM(call->M, call->N, call->K,
Tim Murrayaff74452015-03-30 15:14:36 -0700650 (const uint8_t*)A, call->a_offset, lda,
651 (const uint8_t*)B, call->b_offset, ldb,
652 (uint8_t*)C, call->c_offset, ldc,
653 call->c_mult_int);
654
655 break;
656
Tim Murray64c682b2015-01-09 12:08:43 -0800657 default:
658 ALOGE("unimplemented\n");
659 }
660
661
662}
663
Tim Murray2b999882015-04-13 11:42:54 -0700664void RsdCpuScriptIntrinsicBLAS::kernelBNNM(size_t m, size_t n, size_t k,
Miao Wang06deda32015-06-29 17:29:43 -0700665 const uint8_t* a, uint8_t a_offset, size_t lda,
666 const uint8_t* b, uint8_t b_offset, size_t ldb,
667 uint8_t* c, int32_t c_offset, size_t ldc,
668 int32_t c_mult_int) {
Tim Murray2b999882015-04-13 11:42:54 -0700669 const int c_shift = 21;
Miao Wang223231f2015-07-16 15:31:33 -0700670#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
671 // Non-optimized path for ARMv7 devices without SIMD instructions.
672 if (!gArchUseSIMD) {
Miao Wang9195e512015-09-14 15:50:08 -0700673 /*
674 * Calculations are done in 1.10.21 fixed-point format for the final output,
675 * just before there's a shift down to drop the fractional parts. The output
676 * values are gated to 0 to 255 to fit in a byte, but the 10-bit format
677 * gives some headroom to avoid wrapping around on small overflows.
678 */
Miao Wang223231f2015-07-16 15:31:33 -0700679 size_t i = 0, j = 0, l = 0;
680 for (j = 0; j < n; j++) {
681 for (i = 0; i < m; i++) {
682 int32_t total = 0;
683 for (l = 0; l < k; l++) {
684 const int a_index = ((i * lda) + l);
685 const uint8_t a_as_byte = a[a_index];
686 const int32_t a_as_int = (((int32_t)(a_as_byte)) - a_offset);
687 const int b_index = ((j * ldb) + l);
688 const uint8_t b_as_byte = b[b_index];
689 const int32_t b_as_int = (((int32_t)(b_as_byte)) - b_offset);
690 const int32_t mult_as_int = (a_as_int * b_as_int);
691 total += mult_as_int;
692 }
693 const int c_index = ((ldc * i) + j);
694 int32_t output =
695 ((((total + c_offset) * c_mult_int) + (1 << (c_shift - 1)))
696 >> c_shift);
697 if (output > 255) {
698 output = 255;
699 }
700 if (output < 0) {
701 output = 0;
702 }
703 c[c_index] = (uint8_t)(output);
704 }
705 }
706 return;
707 }
708#endif
709
Miao Wang99d0e812015-07-06 18:05:39 -0700710 // Using gemmlowp to calculate the low precision 8 bit GEMM.
Miao Wange4f999b2016-02-03 11:23:37 -0800711 bool transpose_a = true;
712 bool transpose_b = false;
Miao Wang223231f2015-07-16 15:31:33 -0700713 bool transpose_c = true;
714 gemmlowp::eight_bit_int_gemm::EightBitIntGemm(transpose_a, transpose_b, transpose_c,
715 m, n, k, a, -a_offset, lda,
Miao Wang99d0e812015-07-06 18:05:39 -0700716 b, -b_offset, ldb, c, c_offset,
Miao Wang9195e512015-09-14 15:50:08 -0700717 c_mult_int, c_shift, ldc,
718 gemmlowp::eight_bit_int_gemm::BitDepthSetting::A8B8);
Miao Wang223231f2015-07-16 15:31:33 -0700719
Tim Murrayaff74452015-03-30 15:14:36 -0700720}
721
722
723
724
Tim Murray64c682b2015-01-09 12:08:43 -0800725
726RsdCpuScriptIntrinsicBLAS::RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx,
727 const Script *s)
728 : RsdCpuScriptIntrinsic(ctx, s, nullptr, RS_SCRIPT_INTRINSIC_ID_BLAS) {
729
730
731}
732
733RsdCpuScriptIntrinsicBLAS::~RsdCpuScriptIntrinsicBLAS() {
734}
735
Tim Murray64c682b2015-01-09 12:08:43 -0800736RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
737 const Script *s, const Element *e) {
738
739 return new RsdCpuScriptIntrinsicBLAS(ctx, s);
740}
Chih-Hung Hsieh462de212016-11-16 11:33:57 -0800741
742} // namespace renderscript
743} // namespace android