blob: d165e1230b5756bc957f6a862369ac616f324b7f [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 2000-2003 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26
27
/*
 * FUNCTION
 *      Internal functions for mlib_ImageConv* on U8 type
 *      and MLIB_EDGE_DST_NO_WRITE mask
 *
 */
34
35/***************************************************************/
36
37#include <vis_proto.h>
38#include <mlib_image.h>
39#include <mlib_ImageCheck.h>
40#include <mlib_ImageColormap.h>
41
/*
  These defines switch between the functions generated from this
  shared template in the files:
         mlib_v_ImageConv_8nw.c,
         mlib_v_ImageConvIndex3_8_8nw.c,
         mlib_v_ImageConvIndex4_8_8nw.c,
         mlib_v_ImageConvIndex3_8_16nw.c,
         mlib_v_ImageConvIndex4_8_16nw.c
*/
50
/*
 * CONV_INDEX selects the colour-indexed build of this template: the
 * #ifdef CONV_INDEX branches below generate the *_Index3_8_16nw entry
 * points and route pixels through the colormap lookup tables.
 */
#define CONV_INDEX

/* Element type used for the source and destination row pointers. */
#define DTYPE mlib_s16
/* Element type of the colormap lookup tables read by LOAD_SRC(). */
#define LTYPE mlib_u8
55
56/***************************************************************/
57
/*
 * CONV_FUNC(KERN) expands to the function name plus parameter list of the
 * fixed-size convolution entry point for kernel KERN (e.g. 2x2, 3x3).
 * The indexed build takes an extra colormap argument; the plain build
 * does not.  The return type is written at the point of use.
 */
#ifdef CONV_INDEX

#define CONV_FUNC(KERN)                                 \
  mlib_conv##KERN##_Index3_8_16nw(mlib_image *dst,      \
                                  mlib_image *src,      \
                                  mlib_s32   *kern,     \
                                  mlib_s32   scale,     \
                                  void       *colormap)

#else

#define CONV_FUNC(KERN)                         \
  mlib_conv##KERN##_8nw_f(mlib_image *dst,      \
                          mlib_image *src,      \
                          mlib_s32   *kern,     \
                          mlib_s32   scale)

#endif
76
77/***************************************************************/
78
/*
 * NCHAN is the number of bytes per pixel in the unpacked row buffers.
 * The indexed build fixes it at 3 (a compile-time constant, so it can be
 * tested with #if); the plain build reads it from the local variable
 * nchan declared by DEF_EXTRA_VARS.
 */
#ifdef CONV_INDEX

#define NCHAN  3

#else

#define NCHAN  nchan

#endif
88
89/***************************************************************/
90
/*
 * DEF_VARS declares the locals shared by every convolution routine in
 * this file.  It expects dst, src, kern and scale to be in scope.
 *
 *   sl, sp, dl        - current source line, source read cursor, dest line
 *   hgt, wid          - source image height and width
 *   sll, dll          - source / destination stride in DTYPE elements
 *   adr_src, adr_dst  - image data pointers
 *   ssize .. emask    - row sizes and tail mask, filled in by each routine
 *   buff_ind          - rotating index into the row-buffer table (starts 0)
 *   pbuff, dp         - scratch allocation and destination write pointer
 *   karr              - kernel storage reinterpreted as mlib_f32 taps
 *   gsr_scale         - (31 - scale) moved up by 3 bits; each
 *                       vis_write_gsr() call adds an alignment offset
 *                       (0..7) in the low three bits
 *   drnd              - rounding constant mlib_round_8[31 - scale]
 *                       duplicated into both halves of a double
 *
 * NOTE(review): mlib_round_8 has 16 entries, so this assumes
 * 16 <= scale <= 31 -- confirm that callers guarantee the range.
 */
#define DEF_VARS                                                        \
  DTYPE    *sl, *sp, *dl;                                               \
  mlib_s32 hgt = mlib_ImageGetHeight(src);                              \
  mlib_s32 wid = mlib_ImageGetWidth(src);                               \
  mlib_s32 sll = mlib_ImageGetStride(src) / sizeof(DTYPE);              \
  mlib_s32 dll = mlib_ImageGetStride(dst) / sizeof(DTYPE);              \
  DTYPE    *adr_src = (DTYPE *)mlib_ImageGetData(src);                  \
  DTYPE    *adr_dst = (DTYPE *)mlib_ImageGetData(dst);                  \
  mlib_s32 ssize, xsize, dsize, esize, emask, buff_ind = 0;             \
  mlib_d64 *pbuff, *dp;                                                 \
  mlib_f32 *karr = (mlib_f32 *)kern;                                    \
  mlib_s32 gsr_scale = (31 - scale) << 3;                               \
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);          \
  mlib_s32 i, j, l
105
106/***************************************************************/
107
/*
 * DEF_EXTRA_VARS declares the build-specific locals.
 *
 * Indexed build: fetches the colormap's LUT offset and per-channel table
 * pointers.  Each ltblN is pre-biased by -offset so that it can be indexed
 * directly with a raw source value.  ltbl3 aliases ltbl2 when there are
 * only three channels, so the 4-channel LOAD_SRC() variant still has a
 * valid name to reference.
 *
 * Plain build: reads the channel count from the destination image.
 */
#ifdef CONV_INDEX

#define DEF_EXTRA_VARS                                                  \
  int      offset = mlib_ImageGetLutOffset(colormap);                   \
  LTYPE    **lut_table = (LTYPE**)mlib_ImageGetLutData(colormap);       \
  LTYPE    *ltbl0 = lut_table[0] - offset;                              \
  LTYPE    *ltbl1 = lut_table[1] - offset;                              \
  LTYPE    *ltbl2 = lut_table[2] - offset;                              \
  LTYPE    *ltbl3 = (NCHAN > 3) ? lut_table[3] - offset : ltbl2

#else

#define DEF_EXTRA_VARS                                                  \
  mlib_s32 nchan = mlib_ImageGetChannels(dst)

#endif
124
125/***************************************************************/
126
/*
 * LOAD_SRC() expands one group of source indices through the colormap
 * tables into three consecutive doubles (24 bytes) of the row buffer:
 * buffn[i], buffn[i + 1], buffn[i + 2].  It uses sp, buffn, i and
 * ltbl0..ltbl3 from the enclosing scope and advances sp.
 *
 *   NCHAN == 3 : consumes 8 indices (8 pixels * 3 bytes), sp += 8
 *   otherwise  : consumes 6 indices (6 pixels * 4 bytes), sp += 6
 *
 * Every call site in this file sets the GSR alignment to 7 first
 * (vis_write_gsr(gsr_scale + 7)).  With that setting each
 * vis_faligndata(byte, t) pushes the freshly loaded byte in front of the
 * first seven bytes of t, so the bytes are fed in reverse order and after
 * eight pushes the starting contents of t have been shifted out entirely.
 * That is why t0..t2 are deliberately left uninitialised.
 *
 * When NCHAN is not a preprocessor constant (the plain build, where it
 * names a variable) the #if sees 0 and selects the second variant.
 */
#if NCHAN == 3

#define LOAD_SRC() {                                            \
  mlib_s32 s0 = sp[0], s1 = sp[1], s2 = sp[2], s3 = sp[3];      \
  mlib_s32 s4 = sp[4], s5 = sp[5], s6 = sp[6], s7 = sp[7];      \
  mlib_d64 t0, t1, t2;                                          \
                                                                \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s7), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s7), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl0, s7), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s6), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s6), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl0, s6), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s5), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s5), t2);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s5), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s4), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl1, s4), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s4), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s3), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl1, s3), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s3), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s2), t1);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s2), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s2), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl2, s1), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s1), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s1), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl2, s0), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s0), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s0), t0);              \
                                                                \
  buffn[i    ] = t0;                                            \
  buffn[i + 1] = t1;                                            \
  buffn[i + 2] = t2;                                            \
                                                                \
  sp += 8;                                                      \
}

#else

/* Only six indices are consumed per expansion, so only six are loaded. */
#define LOAD_SRC() {                                            \
  mlib_s32 s0 = sp[0], s1 = sp[1], s2 = sp[2], s3 = sp[3];      \
  mlib_s32 s4 = sp[4], s5 = sp[5];                              \
  mlib_d64 t0, t1, t2;                                          \
                                                                \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl3, s5), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s5), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s5), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl0, s5), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl3, s4), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl2, s4), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl1, s4), t2);              \
  t2 = vis_faligndata(vis_ld_u8_i(ltbl0, s4), t2);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl3, s3), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s3), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl1, s3), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s3), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl3, s2), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl2, s2), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl1, s2), t1);              \
  t1 = vis_faligndata(vis_ld_u8_i(ltbl0, s2), t1);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl3, s1), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl2, s1), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s1), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s1), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl3, s0), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl2, s0), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl1, s0), t0);              \
  t0 = vis_faligndata(vis_ld_u8_i(ltbl0, s0), t0);              \
                                                                \
  buffn[i    ] = t0;                                            \
  buffn[i + 1] = t1;                                            \
  buffn[i + 2] = t2;                                            \
                                                                \
  sp += 6;                                                      \
}

#endif
206
207/***************************************************************/
208
209static mlib_s32 mlib_round_8[16] = { 0x00400040, 0x00200020, 0x00100010, 0x00080008,
210 0x00040004, 0x00020002, 0x00010001, 0x00000000,
211 0x00000000, 0x00000000, 0x00000000, 0x00000000,
212 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
213
214/***************************************************************/
215
216void mlib_ImageCopy_na(mlib_u8 *sa, mlib_u8 *da, int size);
217
218/***************************************************************/
219
220#define KSIZE 2
221
222mlib_status CONV_FUNC(2x2)
223{
224 mlib_d64 *buffs[2*(KSIZE + 1)];
225 mlib_d64 *buff0, *buff1, *buffn, *buffd, *buffe;
226 mlib_d64 s00, s01, s10, s11, s0, s1;
227 mlib_d64 d0, d1, d00, d01, d10, d11;
228 DEF_VARS;
229 DEF_EXTRA_VARS;
230
231 sl = adr_src;
232 dl = adr_dst;
233
234 ssize = NCHAN*wid;
235 dsize = (ssize + 7)/8;
236 esize = dsize + 4;
237 pbuff = mlib_malloc((KSIZE + 4)*esize*sizeof(mlib_d64));
238 if (pbuff == NULL) return MLIB_FAILURE;
239
240 for (i = 0; i < (KSIZE + 1); i++) buffs[i] = pbuff + i*esize;
241 for (i = 0; i < (KSIZE + 1); i++) buffs[(KSIZE + 1) + i] = buffs[i];
242 buffd = buffs[KSIZE] + esize;
243 buffe = buffd + 2*esize;
244
245 wid -= (KSIZE - 1);
246 hgt -= (KSIZE - 1);
247 xsize = ssize - NCHAN*(KSIZE - 1);
248 emask = (0xFF00 >> (xsize & 7)) & 0xFF;
249
250 vis_write_gsr(gsr_scale + 7);
251
252 for (l = 0; l < KSIZE; l++) {
253 mlib_d64 *buffn = buffs[l];
254 sp = sl + l*sll;
255
256#ifndef CONV_INDEX
257 if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
258
259#else
260#pragma pipeloop(0)
261 for (i = 0; i < dsize; i += 3) {
262 LOAD_SRC();
263 }
264#endif /* CONV_INDEX */
265 }
266
267 for (j = 0; j < hgt; j++) {
268 mlib_d64 **buffc = buffs + buff_ind;
269 mlib_f32 *pk = karr, k0, k1;
270 sp = sl + KSIZE*sll;
271
272 buff0 = buffc[0];
273 buff1 = buffc[1];
274 buffn = buffc[KSIZE];
275
276#ifndef CONV_INDEX
277 if ((((mlib_addr)(sl )) & 7) == 0) buff0 = (mlib_d64*)sl;
278 if ((((mlib_addr)(sl + sll)) & 7) == 0) buff1 = (mlib_d64*)(sl + sll);
279 if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
280#endif
281
282 k0 = pk[1];
283 k1 = pk[3];
284 vis_write_gsr(gsr_scale + NCHAN);
285
286 s01 = buff0[0];
287 s11 = buff1[0];
288#pragma pipeloop(0)
289 for (i = 0; i < (xsize + 7)/8; i++) {
290 s00 = s01;
291 s10 = s11;
292 s01 = buff0[i + 1];
293 s11 = buff1[i + 1];
294 s0 = vis_faligndata(s00, s01);
295 s1 = vis_faligndata(s10, s11);
296
297 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
298 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
299 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
300 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
301
302 d0 = vis_fpadd16(d00, d10);
303 d1 = vis_fpadd16(d01, d11);
304 buffd[2*i] = d0;
305 buffd[2*i + 1] = d1;
306 }
307
308 k0 = pk[0];
309 k1 = pk[2];
310#ifndef CONV_INDEX
311 dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
312
313#pragma pipeloop(0)
314 for (i = 0; i < xsize/8; i++) {
315 s0 = buff0[i];
316 s1 = buff1[i];
317
318 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
319 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
320 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
321 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
322
323 d0 = buffd[2*i];
324 d1 = buffd[2*i + 1];
325 d00 = vis_fpadd16(d00, d10);
326 d0 = vis_fpadd16(d0, drnd);
327 d0 = vis_fpadd16(d0, d00);
328 d01 = vis_fpadd16(d01, d11);
329 d1 = vis_fpadd16(d1, drnd);
330 d1 = vis_fpadd16(d1, d01);
331 dp[i] = vis_fpack16_pair(d0, d1);
332 }
333
334 if (emask) {
335 s0 = buff0[i];
336 s1 = buff1[i];
337
338 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
339 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
340 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
341 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
342
343 d0 = buffd[2*i];
344 d1 = buffd[2*i + 1];
345 d00 = vis_fpadd16(d00, d10);
346 d0 = vis_fpadd16(d0, drnd);
347 d0 = vis_fpadd16(d0, d00);
348 d01 = vis_fpadd16(d01, d11);
349 d1 = vis_fpadd16(d1, drnd);
350 d1 = vis_fpadd16(d1, d01);
351
352 d0 = vis_fpack16_pair(d0, d1);
353 vis_pst_8(d0, dp + i, emask);
354 }
355
356 if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
357
358#else
359 vis_write_gsr(gsr_scale + 7);
360
361#pragma pipeloop(0)
362 for (i = 0; i < dsize; i += 3) {
363 mlib_d64 d00, d01, d02, d03, d04, d05;
364 mlib_d64 d10, d11, d12, d13, d14, d15;
365 mlib_d64 d0, d1, d2, d3, d4, d5;
366 mlib_d64 s00 = buff0[i];
367 mlib_d64 s01 = buff0[i + 1];
368 mlib_d64 s02 = buff0[i + 2];
369 mlib_d64 s10 = buff1[i];
370 mlib_d64 s11 = buff1[i + 1];
371 mlib_d64 s12 = buff1[i + 2];
372
373 d00 = vis_fmul8x16au(vis_read_hi(s00), k0);
374 d01 = vis_fmul8x16au(vis_read_lo(s00), k0);
375 d02 = vis_fmul8x16au(vis_read_hi(s01), k0);
376 d03 = vis_fmul8x16au(vis_read_lo(s01), k0);
377 d04 = vis_fmul8x16au(vis_read_hi(s02), k0);
378 d05 = vis_fmul8x16au(vis_read_lo(s02), k0);
379 d10 = vis_fmul8x16au(vis_read_hi(s10), k1);
380 d11 = vis_fmul8x16au(vis_read_lo(s10), k1);
381 d12 = vis_fmul8x16au(vis_read_hi(s11), k1);
382 d13 = vis_fmul8x16au(vis_read_lo(s11), k1);
383 d14 = vis_fmul8x16au(vis_read_hi(s12), k1);
384 d15 = vis_fmul8x16au(vis_read_lo(s12), k1);
385
386 d0 = buffd[2*i];
387 d1 = buffd[2*i + 1];
388 d2 = buffd[2*i + 2];
389 d3 = buffd[2*i + 3];
390 d4 = buffd[2*i + 4];
391 d5 = buffd[2*i + 5];
392 d00 = vis_fpadd16(d00, d10);
393 d0 = vis_fpadd16(d0, drnd);
394 d0 = vis_fpadd16(d0, d00);
395 d01 = vis_fpadd16(d01, d11);
396 d1 = vis_fpadd16(d1, drnd);
397 d1 = vis_fpadd16(d1, d01);
398 d02 = vis_fpadd16(d02, d12);
399 d2 = vis_fpadd16(d2, drnd);
400 d2 = vis_fpadd16(d2, d02);
401 d03 = vis_fpadd16(d03, d13);
402 d3 = vis_fpadd16(d3, drnd);
403 d3 = vis_fpadd16(d3, d03);
404 d04 = vis_fpadd16(d04, d14);
405 d4 = vis_fpadd16(d4, drnd);
406 d4 = vis_fpadd16(d4, d04);
407 d05 = vis_fpadd16(d05, d15);
408 d5 = vis_fpadd16(d5, drnd);
409 d5 = vis_fpadd16(d5, d05);
410
411 buffe[i ] = vis_fpack16_pair(d0, d1);
412 buffe[i + 1] = vis_fpack16_pair(d2, d3);
413 buffe[i + 2] = vis_fpack16_pair(d4, d5);
414
415 LOAD_SRC();
416 }
417
418 mlib_ImageColorTrue2IndexLine_U8_S16_3((void*)buffe, dl, wid, colormap);
419#endif /* CONV_INDEX */
420
421 sl += sll;
422 dl += dll;
423
424 buff_ind++;
425 if (buff_ind >= (KSIZE + 1)) buff_ind = 0;
426 }
427
428 mlib_free(pbuff);
429
430 return MLIB_SUCCESS;
431}
432
433/***************************************************************/
434
435#undef KSIZE
436#define KSIZE 3
437
438mlib_status CONV_FUNC(3x3)
439{
440 mlib_d64 *buffs[2*(KSIZE + 1)];
441 mlib_d64 *buff0, *buff1, *buff2, *buffn, *buffd, *buffe;
442 mlib_d64 s00, s01, s10, s11, s20, s21, s0, s1, s2;
443 mlib_d64 dd, d0, d1, d00, d01, d10, d11, d20, d21;
444 mlib_s32 ik, ik_last, off, doff;
445 DEF_VARS;
446 DEF_EXTRA_VARS;
447
448 sl = adr_src;
449#ifdef CONV_INDEX
450 dl = adr_dst + ((KSIZE - 1)/2)*(dll + 1);
451#else
452 dl = adr_dst + ((KSIZE - 1)/2)*(dll + NCHAN);
453#endif
454
455 ssize = NCHAN*wid;
456 dsize = (ssize + 7)/8;
457 esize = dsize + 4;
458 pbuff = mlib_malloc((KSIZE + 4)*esize*sizeof(mlib_d64));
459 if (pbuff == NULL) return MLIB_FAILURE;
460
461 for (i = 0; i < (KSIZE + 1); i++) buffs[i] = pbuff + i*esize;
462 for (i = 0; i < (KSIZE + 1); i++) buffs[(KSIZE + 1) + i] = buffs[i];
463 buffd = buffs[KSIZE] + esize;
464 buffe = buffd + 2*esize;
465
466 wid -= (KSIZE - 1);
467 hgt -= (KSIZE - 1);
468 xsize = ssize - NCHAN*(KSIZE - 1);
469 emask = (0xFF00 >> (xsize & 7)) & 0xFF;
470
471 vis_write_gsr(gsr_scale + 7);
472
473 for (l = 0; l < KSIZE; l++) {
474 mlib_d64 *buffn = buffs[l];
475 sp = sl + l*sll;
476
477#ifndef CONV_INDEX
478 if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
479#else
480#pragma pipeloop(0)
481 for (i = 0; i < dsize; i += 3) {
482 LOAD_SRC();
483 }
484#endif /* CONV_INDEX */
485 }
486
487 /* init buffer */
488#pragma pipeloop(0)
489 for (i = 0; i < (xsize + 7)/8; i++) {
490 buffd[2*i ] = drnd;
491 buffd[2*i + 1] = drnd;
492 }
493
494 for (j = 0; j < hgt; j++) {
495 mlib_d64 **buffc = buffs + buff_ind, *pbuff0, *pbuff1, *pbuff2;
496 mlib_f32 *pk = karr, k0, k1, k2;
497 sp = sl + KSIZE*sll;
498
499 pbuff0 = buffc[0];
500 pbuff1 = buffc[1];
501 pbuff2 = buffc[2];
502 buffn = buffc[KSIZE];
503
504#ifndef CONV_INDEX
505 if ((((mlib_addr)(sl )) & 7) == 0) pbuff0 = (mlib_d64*)sl;
506 if ((((mlib_addr)(sl + sll)) & 7) == 0) pbuff1 = (mlib_d64*)(sl + sll);
507 if ((((mlib_addr)(sl + 2*sll)) & 7) == 0) pbuff2 = (mlib_d64*)(sl + 2*sll);
508
509 if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
510#endif
511
512#ifdef CONV_INDEX
513 ik_last = 0;
514#else
515 ik_last = (KSIZE - 1);
516#endif
517
518 for (ik = 0; ik < KSIZE; ik++) {
519 k0 = pk[ik];
520 k1 = pk[ik + KSIZE];
521 k2 = pk[ik + 2*KSIZE];
522
523 off = ik*NCHAN;
524 doff = off/8;
525 off &= 7;
526 buff0 = pbuff0 + doff;
527 buff1 = pbuff1 + doff;
528 buff2 = pbuff2 + doff;
529 vis_write_gsr(gsr_scale + off);
530
531 if (ik == ik_last) continue;
532 /*if (!ik_last) {
533 if ((off & 3) || (ik == (KSIZE - 1))) {
534 ik_last = ik;
535 continue;
536 }
537 }*/
538
539 if (off == 0) {
540#pragma pipeloop(0)
541 for (i = 0; i < (xsize + 7)/8; i++) {
542 s0 = buff0[i];
543 s1 = buff1[i];
544 s2 = buff2[i];
545
546 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
547 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
548 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
549 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
550 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
551 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
552
553 d0 = buffd[2*i];
554 d1 = buffd[2*i + 1];
555 d0 = vis_fpadd16(d00, d0);
556 d0 = vis_fpadd16(d10, d0);
557 d0 = vis_fpadd16(d20, d0);
558 d1 = vis_fpadd16(d01, d1);
559 d1 = vis_fpadd16(d11, d1);
560 d1 = vis_fpadd16(d21, d1);
561 buffd[2*i] = d0;
562 buffd[2*i + 1] = d1;
563 }
564
565 } else if (off == 4) {
566 s01 = buff0[0];
567 s11 = buff1[0];
568 s21 = buff2[0];
569#pragma pipeloop(0)
570 for (i = 0; i < (xsize + 7)/8; i++) {
571 s00 = s01;
572 s10 = s11;
573 s20 = s21;
574 s01 = buff0[i + 1];
575 s11 = buff1[i + 1];
576 s21 = buff2[i + 1];
577
578 d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
579 d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
580 d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
581 d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
582 d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
583 d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
584
585 d0 = buffd[2*i];
586 d1 = buffd[2*i + 1];
587 d0 = vis_fpadd16(d00, d0);
588 d0 = vis_fpadd16(d10, d0);
589 d0 = vis_fpadd16(d20, d0);
590 d1 = vis_fpadd16(d01, d1);
591 d1 = vis_fpadd16(d11, d1);
592 d1 = vis_fpadd16(d21, d1);
593 buffd[2*i] = d0;
594 buffd[2*i + 1] = d1;
595 }
596
597 } else {
598 s01 = buff0[0];
599 s11 = buff1[0];
600 s21 = buff2[0];
601#pragma pipeloop(0)
602 for (i = 0; i < (xsize + 7)/8; i++) {
603 s00 = s01;
604 s10 = s11;
605 s20 = s21;
606 s01 = buff0[i + 1];
607 s11 = buff1[i + 1];
608 s21 = buff2[i + 1];
609 s0 = vis_faligndata(s00, s01);
610 s1 = vis_faligndata(s10, s11);
611 s2 = vis_faligndata(s20, s21);
612
613 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
614 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
615 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
616 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
617 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
618 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
619
620 d0 = buffd[2*i];
621 d1 = buffd[2*i + 1];
622 d0 = vis_fpadd16(d00, d0);
623 d0 = vis_fpadd16(d10, d0);
624 d0 = vis_fpadd16(d20, d0);
625 d1 = vis_fpadd16(d01, d1);
626 d1 = vis_fpadd16(d11, d1);
627 d1 = vis_fpadd16(d21, d1);
628 buffd[2*i] = d0;
629 buffd[2*i + 1] = d1;
630 }
631 }
632 }
633
634 k0 = pk[ik_last];
635 k1 = pk[ik_last + KSIZE];
636 k2 = pk[ik_last + 2*KSIZE];
637
638 off = ik_last*NCHAN;
639 doff = off/8;
640 off &= 7;
641 buff0 = pbuff0 + doff;
642 buff1 = pbuff1 + doff;
643 buff2 = pbuff2 + doff;
644 vis_write_gsr(gsr_scale + off);
645
646#ifndef CONV_INDEX
647 dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
648
649 s01 = buff0[0];
650 s11 = buff1[0];
651 s21 = buff2[0];
652#pragma pipeloop(0)
653 for (i = 0; i < xsize/8; i++) {
654 s00 = s01;
655 s10 = s11;
656 s20 = s21;
657 s01 = buff0[i + 1];
658 s11 = buff1[i + 1];
659 s21 = buff2[i + 1];
660 s0 = vis_faligndata(s00, s01);
661 s1 = vis_faligndata(s10, s11);
662 s2 = vis_faligndata(s20, s21);
663
664 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
665 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
666 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
667 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
668 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
669 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
670
671 d0 = buffd[2*i];
672 d1 = buffd[2*i + 1];
673 d0 = vis_fpadd16(d0, d00);
674 d0 = vis_fpadd16(d0, d10);
675 d0 = vis_fpadd16(d0, d20);
676 d1 = vis_fpadd16(d1, d01);
677 d1 = vis_fpadd16(d1, d11);
678 d1 = vis_fpadd16(d1, d21);
679
680 dd = vis_fpack16_pair(d0, d1);
681 dp[i] = dd;
682
683 buffd[2*i ] = drnd;
684 buffd[2*i + 1] = drnd;
685 }
686
687 if (emask) {
688 s00 = s01;
689 s10 = s11;
690 s20 = s21;
691 s01 = buff0[i + 1];
692 s11 = buff1[i + 1];
693 s21 = buff2[i + 1];
694 s0 = vis_faligndata(s00, s01);
695 s1 = vis_faligndata(s10, s11);
696 s2 = vis_faligndata(s20, s21);
697
698 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
699 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
700 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
701 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
702 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
703 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
704
705 d0 = buffd[2*i];
706 d1 = buffd[2*i + 1];
707 d0 = vis_fpadd16(d0, d00);
708 d0 = vis_fpadd16(d0, d10);
709 d0 = vis_fpadd16(d0, d20);
710 d1 = vis_fpadd16(d1, d01);
711 d1 = vis_fpadd16(d1, d11);
712 d1 = vis_fpadd16(d1, d21);
713
714 dd = vis_fpack16_pair(d0, d1);
715 vis_pst_8(dd, dp + i, emask);
716
717 buffd[2*i ] = drnd;
718 buffd[2*i + 1] = drnd;
719 }
720
721 if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
722
723#else
724 vis_write_gsr(gsr_scale + 7);
725
726#pragma pipeloop(0)
727 for (i = 0; i < dsize; i += 3) {
728 mlib_d64 d00, d01, d02, d03, d04, d05;
729 mlib_d64 d10, d11, d12, d13, d14, d15;
730 mlib_d64 d20, d21, d22, d23, d24, d25;
731 mlib_d64 d0, d1, d2, d3, d4, d5;
732 mlib_d64 s00 = buff0[i];
733 mlib_d64 s01 = buff0[i + 1];
734 mlib_d64 s02 = buff0[i + 2];
735 mlib_d64 s10 = buff1[i];
736 mlib_d64 s11 = buff1[i + 1];
737 mlib_d64 s12 = buff1[i + 2];
738 mlib_d64 s20 = buff2[i];
739 mlib_d64 s21 = buff2[i + 1];
740 mlib_d64 s22 = buff2[i + 2];
741
742 d00 = vis_fmul8x16au(vis_read_hi(s00), k0);
743 d01 = vis_fmul8x16au(vis_read_lo(s00), k0);
744 d02 = vis_fmul8x16au(vis_read_hi(s01), k0);
745 d03 = vis_fmul8x16au(vis_read_lo(s01), k0);
746 d04 = vis_fmul8x16au(vis_read_hi(s02), k0);
747 d05 = vis_fmul8x16au(vis_read_lo(s02), k0);
748 d10 = vis_fmul8x16au(vis_read_hi(s10), k1);
749 d11 = vis_fmul8x16au(vis_read_lo(s10), k1);
750 d12 = vis_fmul8x16au(vis_read_hi(s11), k1);
751 d13 = vis_fmul8x16au(vis_read_lo(s11), k1);
752 d14 = vis_fmul8x16au(vis_read_hi(s12), k1);
753 d15 = vis_fmul8x16au(vis_read_lo(s12), k1);
754 d20 = vis_fmul8x16au(vis_read_hi(s20), k2);
755 d21 = vis_fmul8x16au(vis_read_lo(s20), k2);
756 d22 = vis_fmul8x16au(vis_read_hi(s21), k2);
757 d23 = vis_fmul8x16au(vis_read_lo(s21), k2);
758 d24 = vis_fmul8x16au(vis_read_hi(s22), k2);
759 d25 = vis_fmul8x16au(vis_read_lo(s22), k2);
760
761 d0 = buffd[2*i];
762 d1 = buffd[2*i + 1];
763 d2 = buffd[2*i + 2];
764 d3 = buffd[2*i + 3];
765 d4 = buffd[2*i + 4];
766 d5 = buffd[2*i + 5];
767 d0 = vis_fpadd16(d0, d00);
768 d0 = vis_fpadd16(d0, d10);
769 d0 = vis_fpadd16(d0, d20);
770 d1 = vis_fpadd16(d1, d01);
771 d1 = vis_fpadd16(d1, d11);
772 d1 = vis_fpadd16(d1, d21);
773 d2 = vis_fpadd16(d2, d02);
774 d2 = vis_fpadd16(d2, d12);
775 d2 = vis_fpadd16(d2, d22);
776 d3 = vis_fpadd16(d3, d03);
777 d3 = vis_fpadd16(d3, d13);
778 d3 = vis_fpadd16(d3, d23);
779 d4 = vis_fpadd16(d4, d04);
780 d4 = vis_fpadd16(d4, d14);
781 d4 = vis_fpadd16(d4, d24);
782 d5 = vis_fpadd16(d5, d05);
783 d5 = vis_fpadd16(d5, d15);
784 d5 = vis_fpadd16(d5, d25);
785
786 buffe[i ] = vis_fpack16_pair(d0, d1);
787 buffe[i + 1] = vis_fpack16_pair(d2, d3);
788 buffe[i + 2] = vis_fpack16_pair(d4, d5);
789
790 buffd[2*i ] = drnd;
791 buffd[2*i + 1] = drnd;
792 buffd[2*i + 2] = drnd;
793 buffd[2*i + 3] = drnd;
794 buffd[2*i + 4] = drnd;
795 buffd[2*i + 5] = drnd;
796
797 LOAD_SRC();
798 }
799
800 mlib_ImageColorTrue2IndexLine_U8_S16_3((void*)buffe, dl, wid, colormap);
801#endif /* CONV_INDEX */
802
803 sl += sll;
804 dl += dll;
805
806 buff_ind++;
807 if (buff_ind >= (KSIZE + 1)) buff_ind = 0;
808 }
809
810 mlib_free(pbuff);
811
812 return MLIB_SUCCESS;
813}
814
815/***************************************************************/
816
/* The MxN routine takes its kernel size at run time, so KSIZE is retired. */
#undef  KSIZE
/*
 * Largest kernel height n for which the MxN routine keeps its table of
 * row-buffer pointers on the stack; for larger n it allocates the table
 * with mlib_malloc().
 */
#define MAX_N   11
819
820#ifdef CONV_INDEX
821
822mlib_status mlib_convMxN_Index3_8_16nw(mlib_image *dst,
823 mlib_image *src,
824 mlib_s32 m,
825 mlib_s32 n,
826 mlib_s32 dm,
827 mlib_s32 dn,
828 mlib_s32 *kern,
829 mlib_s32 scale,
830 void *colormap)
831
832#else
833
834mlib_status mlib_convMxN_8nw_f(mlib_image *dst,
835 mlib_image *src,
836 mlib_s32 m,
837 mlib_s32 n,
838 mlib_s32 dm,
839 mlib_s32 dn,
840 mlib_s32 *kern,
841 mlib_s32 scale)
842
843#endif
844{
845 mlib_d64 *buffs_local[3*(MAX_N + 1)], **buffs = buffs_local, **buff;
846 mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
847 mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
848 mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
849 mlib_d64 dd, d0, d1;
850 mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
851 DEF_VARS;
852 DEF_EXTRA_VARS;
853
854 if (n > MAX_N) {
855 buffs = mlib_malloc(3*(n + 1)*sizeof(mlib_d64*));
856 if (buffs == NULL) return MLIB_FAILURE;
857 }
858
859 buff = buffs + 2*(n + 1);
860
861 sl = adr_src;
862#ifdef CONV_INDEX
863 dl = adr_dst + dn*dll + dm;
864#else
865 dl = adr_dst + dn*dll + dm*NCHAN;
866#endif
867
868 ssize = NCHAN*wid;
869 dsize = (ssize + 7)/8;
870 esize = dsize + 4;
871 pbuff = mlib_malloc((n + 4)*esize*sizeof(mlib_d64));
872 if (pbuff == NULL) {
873 if (buffs != buffs_local) mlib_free(buffs);
874 return MLIB_FAILURE;
875 }
876
877 for (i = 0; i < (n + 1); i++) buffs[i] = pbuff + i*esize;
878 for (i = 0; i < (n + 1); i++) buffs[(n + 1) + i] = buffs[i];
879 buffd = buffs[n] + esize;
880 buffe = buffd + 2*esize;
881
882 wid -= (m - 1);
883 hgt -= (n - 1);
884 xsize = ssize - NCHAN*(m - 1);
885 emask = (0xFF00 >> (xsize & 7)) & 0xFF;
886
887 vis_write_gsr(gsr_scale + 7);
888
889 for (l = 0; l < n; l++) {
890 mlib_d64 *buffn = buffs[l];
891 sp = sl + l*sll;
892
893#ifndef CONV_INDEX
894 if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
895#else
896#pragma pipeloop(0)
897 for (i = 0; i < dsize; i += 3) {
898 LOAD_SRC();
899 }
900#endif /* CONV_INDEX */
901 }
902
903 /* init buffer */
904#pragma pipeloop(0)
905 for (i = 0; i < (xsize + 7)/8; i++) {
906 buffd[2*i ] = drnd;
907 buffd[2*i + 1] = drnd;
908 }
909
910 for (j = 0; j < hgt; j++) {
911 mlib_d64 **buffc = buffs + buff_ind;
912 mlib_f32 *pk = karr, k0, k1, k2, k3;
913 sp = sl + n*sll;
914
915 for (l = 0; l < n; l++) {
916 buff[l] = buffc[l];
917 }
918 buffn = buffc[n];
919
920#ifndef CONV_INDEX
921 for (l = 0; l < n; l++) {
922 if ((((mlib_addr)(sl + l*sll)) & 7) == 0) buff[l] = (mlib_d64*)(sl + l*sll);
923 }
924 if ((mlib_addr)sp & 7) mlib_ImageCopy_na((void*)sp, (void*)buffn, ssize);
925#endif
926
927#ifdef CONV_INDEX
928 ik_last = 0;
929#else
930 ik_last = (m - 1);
931#endif
932
933 for (jk = 0; jk < n; jk += jk_size) {
934 jk_size = n - jk;
935#ifdef CONV_INDEX
936 if (jk_size >= 5) jk_size = 3;
937 if (jk_size == 4) jk_size = 2;
938#else
939 if (jk_size >= 6) jk_size = 4;
940 if (jk_size == 5) jk_size = 3;
941#endif
942 coff = 0;
943
944 if (jk_size == 2) {
945
946 for (ik = 0; ik < m; ik++, coff += NCHAN) {
947 if (!jk && ik == ik_last) continue;
948
949 k0 = pk[ik];
950 k1 = pk[ik + m];
951
952 doff = coff/8;
953 buff0 = buff[jk ] + doff;
954 buff1 = buff[jk + 1] + doff;
955
956 off = coff & 7;
957 vis_write_gsr(gsr_scale + off);
958
959 s01 = buff0[0];
960 s11 = buff1[0];
961#pragma pipeloop(0)
962 for (i = 0; i < (xsize + 7)/8; i++) {
963 s00 = s01;
964 s10 = s11;
965 s01 = buff0[i + 1];
966 s11 = buff1[i + 1];
967 s0 = vis_faligndata(s00, s01);
968 s1 = vis_faligndata(s10, s11);
969
970 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
971 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
972 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
973 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
974
975 d0 = buffd[2*i];
976 d1 = buffd[2*i + 1];
977 d0 = vis_fpadd16(d00, d0);
978 d0 = vis_fpadd16(d10, d0);
979 d1 = vis_fpadd16(d01, d1);
980 d1 = vis_fpadd16(d11, d1);
981 buffd[2*i] = d0;
982 buffd[2*i + 1] = d1;
983 }
984
985 }
986
987 pk += 2*m;
988
989 } else if (jk_size == 3) {
990
991 for (ik = 0; ik < m; ik++, coff += NCHAN) {
992 if (!jk && ik == ik_last) continue;
993
994 k0 = pk[ik];
995 k1 = pk[ik + m];
996 k2 = pk[ik + 2*m];
997
998 doff = coff/8;
999 buff0 = buff[jk ] + doff;
1000 buff1 = buff[jk + 1] + doff;
1001 buff2 = buff[jk + 2] + doff;
1002
1003 off = coff & 7;
1004 vis_write_gsr(gsr_scale + off);
1005
1006 if (off == 0) {
1007#pragma pipeloop(0)
1008 for (i = 0; i < (xsize + 7)/8; i++) {
1009 d0 = buffd[2*i];
1010 d1 = buffd[2*i + 1];
1011
1012 s0 = buff0[i];
1013 s1 = buff1[i];
1014 s2 = buff2[i];
1015
1016 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1017 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1018 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1019 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1020 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1021 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1022
1023 d00 = vis_fpadd16(d00, d10);
1024 d0 = vis_fpadd16(d20, d0);
1025 d0 = vis_fpadd16(d00, d0);
1026 d01 = vis_fpadd16(d01, d11);
1027 d1 = vis_fpadd16(d21, d1);
1028 d1 = vis_fpadd16(d01, d1);
1029 buffd[2*i] = d0;
1030 buffd[2*i + 1] = d1;
1031 }
1032
1033 } else if (off == 4) {
1034 s01 = buff0[0];
1035 s11 = buff1[0];
1036 s21 = buff2[0];
1037#pragma pipeloop(0)
1038 for (i = 0; i < (xsize + 7)/8; i++) {
1039 d0 = buffd[2*i];
1040 d1 = buffd[2*i + 1];
1041
1042 s00 = s01;
1043 s10 = s11;
1044 s20 = s21;
1045 s01 = buff0[i + 1];
1046 s11 = buff1[i + 1];
1047 s21 = buff2[i + 1];
1048
1049 d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
1050 d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
1051 d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
1052 d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
1053 d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
1054 d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
1055
1056 d00 = vis_fpadd16(d00, d10);
1057 d0 = vis_fpadd16(d20, d0);
1058 d0 = vis_fpadd16(d00, d0);
1059 d01 = vis_fpadd16(d01, d11);
1060 d1 = vis_fpadd16(d21, d1);
1061 d1 = vis_fpadd16(d01, d1);
1062 buffd[2*i] = d0;
1063 buffd[2*i + 1] = d1;
1064 }
1065
1066 } else {
1067 s01 = buff0[0];
1068 s11 = buff1[0];
1069 s21 = buff2[0];
1070#pragma pipeloop(0)
1071 for (i = 0; i < (xsize + 7)/8; i++) {
1072 d0 = buffd[2*i];
1073 d1 = buffd[2*i + 1];
1074
1075 s00 = s01;
1076 s10 = s11;
1077 s20 = s21;
1078 s01 = buff0[i + 1];
1079 s11 = buff1[i + 1];
1080 s21 = buff2[i + 1];
1081 s0 = vis_faligndata(s00, s01);
1082 s1 = vis_faligndata(s10, s11);
1083 s2 = vis_faligndata(s20, s21);
1084
1085 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1086 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1087 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1088 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1089 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1090 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1091
1092 d00 = vis_fpadd16(d00, d10);
1093 d0 = vis_fpadd16(d20, d0);
1094 d0 = vis_fpadd16(d00, d0);
1095 d01 = vis_fpadd16(d01, d11);
1096 d1 = vis_fpadd16(d21, d1);
1097 d1 = vis_fpadd16(d01, d1);
1098 buffd[2*i] = d0;
1099 buffd[2*i + 1] = d1;
1100 }
1101 }
1102 }
1103
1104 pk += 3*m;
1105
1106 } else { /* jk_size == 4 */
1107
1108 for (ik = 0; ik < m; ik++, coff += NCHAN) {
1109 if (!jk && ik == ik_last) continue;
1110
1111 k0 = pk[ik];
1112 k1 = pk[ik + m];
1113 k2 = pk[ik + 2*m];
1114 k3 = pk[ik + 3*m];
1115
1116 doff = coff/8;
1117 buff0 = buff[jk ] + doff;
1118 buff1 = buff[jk + 1] + doff;
1119 buff2 = buff[jk + 2] + doff;
1120 buff3 = buff[jk + 3] + doff;
1121
1122 off = coff & 7;
1123 vis_write_gsr(gsr_scale + off);
1124
1125 if (off == 0) {
1126
1127#pragma pipeloop(0)
1128 for (i = 0; i < (xsize + 7)/8; i++) {
1129 d0 = buffd[2*i];
1130 d1 = buffd[2*i + 1];
1131
1132 s0 = buff0[i];
1133 s1 = buff1[i];
1134 s2 = buff2[i];
1135 s3 = buff3[i];
1136
1137 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1138 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1139 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1140 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1141 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1142 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1143 d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
1144 d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
1145
1146 d00 = vis_fpadd16(d00, d10);
1147 d20 = vis_fpadd16(d20, d30);
1148 d0 = vis_fpadd16(d0, d00);
1149 d0 = vis_fpadd16(d0, d20);
1150 d01 = vis_fpadd16(d01, d11);
1151 d21 = vis_fpadd16(d21, d31);
1152 d1 = vis_fpadd16(d1, d01);
1153 d1 = vis_fpadd16(d1, d21);
1154 buffd[2*i] = d0;
1155 buffd[2*i + 1] = d1;
1156 }
1157
1158 } else if (off == 4) {
1159
1160 s01 = buff0[0];
1161 s11 = buff1[0];
1162 s21 = buff2[0];
1163 s31 = buff3[0];
1164#pragma pipeloop(0)
1165 for (i = 0; i < (xsize + 7)/8; i++) {
1166 d0 = buffd[2*i];
1167 d1 = buffd[2*i + 1];
1168
1169 s00 = s01;
1170 s10 = s11;
1171 s20 = s21;
1172 s30 = s31;
1173 s01 = buff0[i + 1];
1174 s11 = buff1[i + 1];
1175 s21 = buff2[i + 1];
1176 s31 = buff3[i + 1];
1177
1178 d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
1179 d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
1180 d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
1181 d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
1182 d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
1183 d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
1184 d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
1185 d31 = vis_fmul8x16au(vis_read_hi(s31), k3);
1186
1187 d00 = vis_fpadd16(d00, d10);
1188 d20 = vis_fpadd16(d20, d30);
1189 d0 = vis_fpadd16(d0, d00);
1190 d0 = vis_fpadd16(d0, d20);
1191 d01 = vis_fpadd16(d01, d11);
1192 d21 = vis_fpadd16(d21, d31);
1193 d1 = vis_fpadd16(d1, d01);
1194 d1 = vis_fpadd16(d1, d21);
1195 buffd[2*i] = d0;
1196 buffd[2*i + 1] = d1;
1197 }
1198
1199 } else {
1200
1201 s01 = buff0[0];
1202 s11 = buff1[0];
1203 s21 = buff2[0];
1204 s31 = buff3[0];
1205#pragma pipeloop(0)
1206 for (i = 0; i < (xsize + 7)/8; i++) {
1207 d0 = buffd[2*i];
1208 d1 = buffd[2*i + 1];
1209
1210 s00 = s01;
1211 s10 = s11;
1212 s20 = s21;
1213 s30 = s31;
1214 s01 = buff0[i + 1];
1215 s11 = buff1[i + 1];
1216 s21 = buff2[i + 1];
1217 s31 = buff3[i + 1];
1218 s0 = vis_faligndata(s00, s01);
1219 s1 = vis_faligndata(s10, s11);
1220 s2 = vis_faligndata(s20, s21);
1221 s3 = vis_faligndata(s30, s31);
1222
1223 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1224 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1225 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1226 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1227 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1228 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1229 d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
1230 d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
1231
1232 d00 = vis_fpadd16(d00, d10);
1233 d20 = vis_fpadd16(d20, d30);
1234 d0 = vis_fpadd16(d0, d00);
1235 d0 = vis_fpadd16(d0, d20);
1236 d01 = vis_fpadd16(d01, d11);
1237 d21 = vis_fpadd16(d21, d31);
1238 d1 = vis_fpadd16(d1, d01);
1239 d1 = vis_fpadd16(d1, d21);
1240 buffd[2*i] = d0;
1241 buffd[2*i + 1] = d1;
1242 }
1243 }
1244 }
1245
1246 pk += 4*m;
1247 }
1248 }
1249
1250 /*****************************************
1251 *****************************************
1252 ** Final iteration **
1253 *****************************************
1254 *****************************************/
1255
1256 jk_size = n;
1257#ifdef CONV_INDEX
1258 if (jk_size >= 5) jk_size = 3;
1259 if (jk_size == 4) jk_size = 2;
1260#else
1261 if (jk_size >= 6) jk_size = 4;
1262 if (jk_size == 5) jk_size = 3;
1263#endif
1264
1265 k0 = karr[ik_last];
1266 k1 = karr[ik_last + m];
1267 k2 = karr[ik_last + 2*m];
1268 k3 = karr[ik_last + 3*m];
1269
1270 off = ik_last*NCHAN;
1271 doff = off/8;
1272 off &= 7;
1273 buff0 = buff[0] + doff;
1274 buff1 = buff[1] + doff;
1275 buff2 = buff[2] + doff;
1276 buff3 = buff[3] + doff;
1277 vis_write_gsr(gsr_scale + off);
1278
1279#ifndef CONV_INDEX
1280 if (jk_size == 2) {
1281 dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
1282
1283 s01 = buff0[0];
1284 s11 = buff1[0];
1285#pragma pipeloop(0)
1286 for (i = 0; i < xsize/8; i++) {
1287 s00 = s01;
1288 s10 = s11;
1289 s01 = buff0[i + 1];
1290 s11 = buff1[i + 1];
1291 s0 = vis_faligndata(s00, s01);
1292 s1 = vis_faligndata(s10, s11);
1293
1294 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1295 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1296 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1297 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1298
1299 d0 = buffd[2*i];
1300 d1 = buffd[2*i + 1];
1301 d0 = vis_fpadd16(d0, d00);
1302 d0 = vis_fpadd16(d0, d10);
1303 d1 = vis_fpadd16(d1, d01);
1304 d1 = vis_fpadd16(d1, d11);
1305
1306 dd = vis_fpack16_pair(d0, d1);
1307 dp[i] = dd;
1308
1309 buffd[2*i ] = drnd;
1310 buffd[2*i + 1] = drnd;
1311 }
1312
1313 if (emask) {
1314 s00 = s01;
1315 s10 = s11;
1316 s01 = buff0[i + 1];
1317 s11 = buff1[i + 1];
1318 s0 = vis_faligndata(s00, s01);
1319 s1 = vis_faligndata(s10, s11);
1320
1321 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1322 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1323 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1324 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1325
1326 d0 = buffd[2*i];
1327 d1 = buffd[2*i + 1];
1328 d0 = vis_fpadd16(d0, d00);
1329 d0 = vis_fpadd16(d0, d10);
1330 d1 = vis_fpadd16(d1, d01);
1331 d1 = vis_fpadd16(d1, d11);
1332
1333 dd = vis_fpack16_pair(d0, d1);
1334 vis_pst_8(dd, dp + i, emask);
1335
1336 buffd[2*i ] = drnd;
1337 buffd[2*i + 1] = drnd;
1338 }
1339
1340 if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
1341
1342 } else if (jk_size == 3) {
1343
1344 dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
1345
1346 s01 = buff0[0];
1347 s11 = buff1[0];
1348 s21 = buff2[0];
1349#pragma pipeloop(0)
1350 for (i = 0; i < xsize/8; i++) {
1351 s00 = s01;
1352 s10 = s11;
1353 s20 = s21;
1354 s01 = buff0[i + 1];
1355 s11 = buff1[i + 1];
1356 s21 = buff2[i + 1];
1357 s0 = vis_faligndata(s00, s01);
1358 s1 = vis_faligndata(s10, s11);
1359 s2 = vis_faligndata(s20, s21);
1360
1361 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1362 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1363 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1364 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1365 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1366 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1367
1368 d0 = buffd[2*i];
1369 d1 = buffd[2*i + 1];
1370 d0 = vis_fpadd16(d0, d00);
1371 d0 = vis_fpadd16(d0, d10);
1372 d0 = vis_fpadd16(d0, d20);
1373 d1 = vis_fpadd16(d1, d01);
1374 d1 = vis_fpadd16(d1, d11);
1375 d1 = vis_fpadd16(d1, d21);
1376
1377 dd = vis_fpack16_pair(d0, d1);
1378 dp[i] = dd;
1379
1380 buffd[2*i ] = drnd;
1381 buffd[2*i + 1] = drnd;
1382 }
1383
1384 if (emask) {
1385 s00 = s01;
1386 s10 = s11;
1387 s20 = s21;
1388 s01 = buff0[i + 1];
1389 s11 = buff1[i + 1];
1390 s21 = buff2[i + 1];
1391 s0 = vis_faligndata(s00, s01);
1392 s1 = vis_faligndata(s10, s11);
1393 s2 = vis_faligndata(s20, s21);
1394
1395 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1396 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1397 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1398 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1399 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1400 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1401
1402 d0 = buffd[2*i];
1403 d1 = buffd[2*i + 1];
1404 d0 = vis_fpadd16(d0, d00);
1405 d0 = vis_fpadd16(d0, d10);
1406 d0 = vis_fpadd16(d0, d20);
1407 d1 = vis_fpadd16(d1, d01);
1408 d1 = vis_fpadd16(d1, d11);
1409 d1 = vis_fpadd16(d1, d21);
1410
1411 dd = vis_fpack16_pair(d0, d1);
1412 vis_pst_8(dd, dp + i, emask);
1413
1414 buffd[2*i ] = drnd;
1415 buffd[2*i + 1] = drnd;
1416 }
1417
1418 if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
1419
1420 } else /* if (jk_size == 4) */ {
1421
1422 dp = ((mlib_addr)dl & 7) ? buffe : (mlib_d64*)dl;
1423
1424 s01 = buff0[0];
1425 s11 = buff1[0];
1426 s21 = buff2[0];
1427 s31 = buff3[0];
1428#pragma pipeloop(0)
1429 for (i = 0; i < xsize/8; i++) {
1430 s00 = s01;
1431 s10 = s11;
1432 s20 = s21;
1433 s30 = s31;
1434 s01 = buff0[i + 1];
1435 s11 = buff1[i + 1];
1436 s21 = buff2[i + 1];
1437 s31 = buff3[i + 1];
1438 s0 = vis_faligndata(s00, s01);
1439 s1 = vis_faligndata(s10, s11);
1440 s2 = vis_faligndata(s20, s21);
1441 s3 = vis_faligndata(s30, s31);
1442
1443 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1444 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1445 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1446 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1447 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1448 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1449 d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
1450 d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
1451
1452 d0 = buffd[2*i];
1453 d1 = buffd[2*i + 1];
1454 d0 = vis_fpadd16(d0, d00);
1455 d0 = vis_fpadd16(d0, d10);
1456 d0 = vis_fpadd16(d0, d20);
1457 d0 = vis_fpadd16(d0, d30);
1458 d1 = vis_fpadd16(d1, d01);
1459 d1 = vis_fpadd16(d1, d11);
1460 d1 = vis_fpadd16(d1, d21);
1461 d1 = vis_fpadd16(d1, d31);
1462
1463 dd = vis_fpack16_pair(d0, d1);
1464 dp[i] = dd;
1465
1466 buffd[2*i ] = drnd;
1467 buffd[2*i + 1] = drnd;
1468 }
1469
1470 if (emask) {
1471 s00 = s01;
1472 s10 = s11;
1473 s20 = s21;
1474 s30 = s31;
1475 s01 = buff0[i + 1];
1476 s11 = buff1[i + 1];
1477 s21 = buff2[i + 1];
1478 s31 = buff3[i + 1];
1479 s0 = vis_faligndata(s00, s01);
1480 s1 = vis_faligndata(s10, s11);
1481 s2 = vis_faligndata(s20, s21);
1482 s3 = vis_faligndata(s30, s31);
1483
1484 d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1485 d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1486 d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1487 d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1488 d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1489 d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1490 d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
1491 d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
1492
1493 d0 = buffd[2*i];
1494 d1 = buffd[2*i + 1];
1495 d0 = vis_fpadd16(d0, d00);
1496 d0 = vis_fpadd16(d0, d10);
1497 d0 = vis_fpadd16(d0, d20);
1498 d0 = vis_fpadd16(d0, d30);
1499 d1 = vis_fpadd16(d1, d01);
1500 d1 = vis_fpadd16(d1, d11);
1501 d1 = vis_fpadd16(d1, d21);
1502 d1 = vis_fpadd16(d1, d31);
1503
1504 dd = vis_fpack16_pair(d0, d1);
1505 vis_pst_8(dd, dp + i, emask);
1506
1507 buffd[2*i ] = drnd;
1508 buffd[2*i + 1] = drnd;
1509 }
1510
1511 if ((mlib_u8*)dp != dl) mlib_ImageCopy_na((void*)buffe, dl, xsize);
1512 }
1513
1514#else /* CONV_INDEX */
1515
1516 if (jk_size == 2) {
1517 vis_write_gsr(gsr_scale + 7);
1518
1519#pragma pipeloop(0)
1520 for (i = 0; i < dsize; i += 3) {
1521 mlib_d64 d00, d01, d02, d03, d04, d05;
1522 mlib_d64 d10, d11, d12, d13, d14, d15;
1523 mlib_d64 d0, d1, d2, d3, d4, d5;
1524 mlib_d64 s00 = buff0[i];
1525 mlib_d64 s01 = buff0[i + 1];
1526 mlib_d64 s02 = buff0[i + 2];
1527 mlib_d64 s10 = buff1[i];
1528 mlib_d64 s11 = buff1[i + 1];
1529 mlib_d64 s12 = buff1[i + 2];
1530
1531 d00 = vis_fmul8x16au(vis_read_hi(s00), k0);
1532 d01 = vis_fmul8x16au(vis_read_lo(s00), k0);
1533 d02 = vis_fmul8x16au(vis_read_hi(s01), k0);
1534 d03 = vis_fmul8x16au(vis_read_lo(s01), k0);
1535 d04 = vis_fmul8x16au(vis_read_hi(s02), k0);
1536 d05 = vis_fmul8x16au(vis_read_lo(s02), k0);
1537 d10 = vis_fmul8x16au(vis_read_hi(s10), k1);
1538 d11 = vis_fmul8x16au(vis_read_lo(s10), k1);
1539 d12 = vis_fmul8x16au(vis_read_hi(s11), k1);
1540 d13 = vis_fmul8x16au(vis_read_lo(s11), k1);
1541 d14 = vis_fmul8x16au(vis_read_hi(s12), k1);
1542 d15 = vis_fmul8x16au(vis_read_lo(s12), k1);
1543
1544 d0 = buffd[2*i];
1545 d1 = buffd[2*i + 1];
1546 d2 = buffd[2*i + 2];
1547 d3 = buffd[2*i + 3];
1548 d4 = buffd[2*i + 4];
1549 d5 = buffd[2*i + 5];
1550 d0 = vis_fpadd16(d0, d00);
1551 d0 = vis_fpadd16(d0, d10);
1552 d1 = vis_fpadd16(d1, d01);
1553 d1 = vis_fpadd16(d1, d11);
1554 d2 = vis_fpadd16(d2, d02);
1555 d2 = vis_fpadd16(d2, d12);
1556 d3 = vis_fpadd16(d3, d03);
1557 d3 = vis_fpadd16(d3, d13);
1558 d4 = vis_fpadd16(d4, d04);
1559 d4 = vis_fpadd16(d4, d14);
1560 d5 = vis_fpadd16(d5, d05);
1561 d5 = vis_fpadd16(d5, d15);
1562
1563 buffe[i ] = vis_fpack16_pair(d0, d1);
1564 buffe[i + 1] = vis_fpack16_pair(d2, d3);
1565 buffe[i + 2] = vis_fpack16_pair(d4, d5);
1566
1567 buffd[2*i ] = drnd;
1568 buffd[2*i + 1] = drnd;
1569 buffd[2*i + 2] = drnd;
1570 buffd[2*i + 3] = drnd;
1571 buffd[2*i + 4] = drnd;
1572 buffd[2*i + 5] = drnd;
1573
1574 LOAD_SRC();
1575 }
1576
1577 } else /* if (jk_size == 3) */ {
1578 vis_write_gsr(gsr_scale + 7);
1579
1580#pragma pipeloop(0)
1581 for (i = 0; i < dsize; i += 3) {
1582 mlib_d64 d00, d01, d02, d03, d04, d05;
1583 mlib_d64 d10, d11, d12, d13, d14, d15;
1584 mlib_d64 d20, d21, d22, d23, d24, d25;
1585 mlib_d64 d0, d1, d2, d3, d4, d5;
1586 mlib_d64 s00 = buff0[i];
1587 mlib_d64 s01 = buff0[i + 1];
1588 mlib_d64 s02 = buff0[i + 2];
1589 mlib_d64 s10 = buff1[i];
1590 mlib_d64 s11 = buff1[i + 1];
1591 mlib_d64 s12 = buff1[i + 2];
1592 mlib_d64 s20 = buff2[i];
1593 mlib_d64 s21 = buff2[i + 1];
1594 mlib_d64 s22 = buff2[i + 2];
1595
1596 d00 = vis_fmul8x16au(vis_read_hi(s00), k0);
1597 d01 = vis_fmul8x16au(vis_read_lo(s00), k0);
1598 d02 = vis_fmul8x16au(vis_read_hi(s01), k0);
1599 d03 = vis_fmul8x16au(vis_read_lo(s01), k0);
1600 d04 = vis_fmul8x16au(vis_read_hi(s02), k0);
1601 d05 = vis_fmul8x16au(vis_read_lo(s02), k0);
1602 d10 = vis_fmul8x16au(vis_read_hi(s10), k1);
1603 d11 = vis_fmul8x16au(vis_read_lo(s10), k1);
1604 d12 = vis_fmul8x16au(vis_read_hi(s11), k1);
1605 d13 = vis_fmul8x16au(vis_read_lo(s11), k1);
1606 d14 = vis_fmul8x16au(vis_read_hi(s12), k1);
1607 d15 = vis_fmul8x16au(vis_read_lo(s12), k1);
1608 d20 = vis_fmul8x16au(vis_read_hi(s20), k2);
1609 d21 = vis_fmul8x16au(vis_read_lo(s20), k2);
1610 d22 = vis_fmul8x16au(vis_read_hi(s21), k2);
1611 d23 = vis_fmul8x16au(vis_read_lo(s21), k2);
1612 d24 = vis_fmul8x16au(vis_read_hi(s22), k2);
1613 d25 = vis_fmul8x16au(vis_read_lo(s22), k2);
1614
1615 d0 = buffd[2*i];
1616 d1 = buffd[2*i + 1];
1617 d2 = buffd[2*i + 2];
1618 d3 = buffd[2*i + 3];
1619 d4 = buffd[2*i + 4];
1620 d5 = buffd[2*i + 5];
1621 d0 = vis_fpadd16(d0, d00);
1622 d0 = vis_fpadd16(d0, d10);
1623 d0 = vis_fpadd16(d0, d20);
1624 d1 = vis_fpadd16(d1, d01);
1625 d1 = vis_fpadd16(d1, d11);
1626 d1 = vis_fpadd16(d1, d21);
1627 d2 = vis_fpadd16(d2, d02);
1628 d2 = vis_fpadd16(d2, d12);
1629 d2 = vis_fpadd16(d2, d22);
1630 d3 = vis_fpadd16(d3, d03);
1631 d3 = vis_fpadd16(d3, d13);
1632 d3 = vis_fpadd16(d3, d23);
1633 d4 = vis_fpadd16(d4, d04);
1634 d4 = vis_fpadd16(d4, d14);
1635 d4 = vis_fpadd16(d4, d24);
1636 d5 = vis_fpadd16(d5, d05);
1637 d5 = vis_fpadd16(d5, d15);
1638 d5 = vis_fpadd16(d5, d25);
1639
1640 buffe[i ] = vis_fpack16_pair(d0, d1);
1641 buffe[i + 1] = vis_fpack16_pair(d2, d3);
1642 buffe[i + 2] = vis_fpack16_pair(d4, d5);
1643
1644 buffd[2*i ] = drnd;
1645 buffd[2*i + 1] = drnd;
1646 buffd[2*i + 2] = drnd;
1647 buffd[2*i + 3] = drnd;
1648 buffd[2*i + 4] = drnd;
1649 buffd[2*i + 5] = drnd;
1650
1651 LOAD_SRC();
1652 }
1653 }
1654#endif /* CONV_INDEX */
1655
1656#ifdef CONV_INDEX
1657 mlib_ImageColorTrue2IndexLine_U8_S16_3((void*)buffe, dl, wid, colormap);
1658#endif /* CONV_INDEX */
1659
1660 sl += sll;
1661 dl += dll;
1662
1663 buff_ind++;
1664 if (buff_ind >= (n + 1)) buff_ind = 0;
1665 }
1666
1667 mlib_free(pbuff);
1668 if (buffs != buffs_local) mlib_free(buffs);
1669
1670 return MLIB_SUCCESS;
1671}
1672
1673/***************************************************************/