/*
 * Copyright 2003 Sun Microsystems, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */


/*
 * FUNCTION
 *      Internal functions for mlib_ImageConv* on U8/S16/U16 types and
 *      MLIB_EDGE_DST_NO_WRITE mask
 */

33#include "mlib_image.h"
34#include "mlib_ImageConv.h"
35#include "mlib_c_ImageConv.h"
36
/*
 * IMG_TYPE selects the pixel type this file is compiled for:
 *   1 - mlib_u8, 2 - mlib_s16, 3 - mlib_u16.
 *
 * For the selected type the following names are defined:
 *   DTYPE          pixel type of the source and destination images
 *   CONV_FUNC(K)   name of the floating-point convolution for kernel K
 *   CONV_FUNC_I(K) name of the integer convolution for kernel K
 *   DSCALE         initial kernel scale factor (see LOAD_KERNEL*)
 *   FROM_S32(x)    extracts a pixel from a clamped 32-bit result
 *   S64TOS32(x)    prepares the low 32-bit half packed by LOAD_BUFF
 *   SAT_OFF        token sequence appended to the sum inside D2I();
 *                  "-(1u << 31)" shifts the sum before clamping and the
 *                  XOR in FROM_S32 flips the top bit back
 */
#define IMG_TYPE 1

/***************************************************************/
#if IMG_TYPE == 1

#define DTYPE             mlib_u8
#define CONV_FUNC(KERN)   mlib_c_conv##KERN##nw_u8
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8
#define DSCALE            (1 << 24)
#define FROM_S32(x)       (((x) >> 24) ^ 128)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#elif IMG_TYPE == 2

#define DTYPE             mlib_s16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_s16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16
#define DSCALE            65536.0
#define FROM_S32(x)       ((x) >> 16)
#define S64TOS32(x)       ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

#define DTYPE             mlib_u16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_u16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16
#define DSCALE            65536.0
#define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#endif /* IMG_TYPE == 1 */

/***************************************************************/
#define BUFF_SIZE   1600

#define CACHE_SIZE  (64*1024)

/***************************************************************/
/* Floating-point type used for the line buffers and the kernel. */
#define FTYPE mlib_d64

/*
 * CLAMP_S32(x) converts x to mlib_s32, saturating at MLIB_S32_MIN and
 * MLIB_S32_MAX.  When MLIB_USE_FTOI_CLAMPING is defined a plain cast is
 * used instead.  Note that the saturating form evaluates x more than once.
 */
#ifndef MLIB_USE_FTOI_CLAMPING

#define CLAMP_S32(x)                                                    \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN :                               \
   (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/* Applies the per-type offset SAT_OFF to x, then clamps to mlib_s32. */
#define D2I(x) CLAMP_S32((x) SAT_OFF)

/***************************************************************/
/*
 * STORE2(res0, res1) writes two results to dp[0] and dp[chan1]; which
 * result goes to which pixel depends on _LITTLE_ENDIAN.  It expands to
 * two statements and relies on `dp` and `chan1` of the calling scope.
 */
#ifdef _LITTLE_ENDIAN

#define STORE2(res0, res1)                                      \
  dp[0    ] = res1;                                             \
  dp[chan1] = res0

#else

#define STORE2(res0, res1)                                      \
  dp[0    ] = res0;                                             \
  dp[chan1] = res1

#endif /* _LITTLE_ENDIAN */

/***************************************************************/
/*
 * LOAD_BUFF(buff) copies the pixels sp[0] and sp[chan1] into buff[i] and
 * buff[i + 1].  Without _NO_LONGLONG both values are packed into one
 * mlib_s64 and written with a single store, with the half order chosen by
 * _LITTLE_ENDIAN.  Relies on `sp`, `chan1` and `i` of the calling scope.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)                                         \
  buff[i    ] = sp[0];                                          \
  buff[i + 1] = sp[chan1]

#else /* _NO_LONGLONG */

#ifdef _LITTLE_ENDIAN

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])

#else /* _LITTLE_ENDIAN */

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])

#endif /* _LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */

134/***************************************************************/
135typedef union {
136 mlib_d64 d64;
137 struct {
138 mlib_s32 i0;
139 mlib_s32 i1;
140 } i32s;
141 struct {
142 mlib_s32 f0;
143 mlib_s32 f1;
144 } f32s;
145} d64_2x32;
146
/***************************************************************/
/*
 * Line width up to which the convolution functions use their on-stack
 * buffer; wider images allocate the buffer with mlib_malloc().
 */
#define BUFF_LINE 256

/***************************************************************/
/*
 * Declares the common locals of the floating-point convolution functions.
 * Requires an array named `buff` to be declared before it.
 */
#define DEF_VARS(type)                                          \
  type     *adr_src, *sl, *sp;                                  \
  type     *adr_dst, *dl, *dp;                                  \
  FTYPE    *pbuff = buff;                                       \
  mlib_s32 wid, hgt, sll, dll;                                  \
  mlib_s32 nchannel, chan1;                                     \
  mlib_s32 i, j, c

/***************************************************************/
/*
 * Declares k0..k8 and the pixel registers of the 3x3 function, and loads
 * the kernel as kern[n] * DSCALE / 2^scalef_expon.  The division is done
 * in steps of at most 2^30.  Modifies `scalef_expon`.
 */
#define LOAD_KERNEL3()                                                   \
  FTYPE scalef = DSCALE;                                                 \
  FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8;                              \
  FTYPE p00, p01, p02, p03,                                              \
        p10, p11, p12, p13,                                              \
        p20, p21, p22, p23;                                              \
                                                                         \
  while (scalef_expon > 30) {                                            \
    scalef /= (1 << 30);                                                 \
    scalef_expon -= 30;                                                  \
  }                                                                      \
                                                                         \
  scalef /= (1 << scalef_expon);                                         \
                                                                         \
  /* keep kernel in regs */                                              \
  k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2]; \
  k3 = scalef * kern[3];  k4 = scalef * kern[4];  k5 = scalef * kern[5]; \
  k6 = scalef * kern[6];  k7 = scalef * kern[7];  k8 = scalef * kern[8]

/***************************************************************/
/*
 * Loads SIZE kernel coefficients into the caller's array `k`, scaled the
 * same way as in LOAD_KERNEL3.  Uses `j` and modifies `scalef_expon`.
 */
#define LOAD_KERNEL(SIZE)                                       \
  FTYPE scalef = DSCALE;                                        \
                                                                \
  while (scalef_expon > 30) {                                   \
    scalef /= (1 << 30);                                        \
    scalef_expon -= 30;                                         \
  }                                                             \
                                                                \
  scalef /= (1 << scalef_expon);                                \
                                                                \
  for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]

/***************************************************************/
/*
 * Reads size, channel count, line strides (converted from bytes to
 * elements of `type`) and data pointers of `src` and `dst` into the
 * caller's locals.
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  hgt = mlib_ImageGetHeight(src);                               \
  wid = mlib_ImageGetWidth(src);                                \
  nchannel = mlib_ImageGetChannels(src);                        \
  sll = mlib_ImageGetStride(src) / sizeof(type);                \
  dll = mlib_ImageGetStride(dst) / sizeof(type);                \
  adr_src = (type *)mlib_ImageGetData(src);                     \
  adr_dst = (type *)mlib_ImageGetData(dst)

/***************************************************************/
#ifndef __sparc

/*
 * CLAMP_STORE(dst, val) stores val into dst, saturated to the range of
 * the destination pixel type.
 *
 * Every variant expands to one brace-enclosed compound statement, so it
 * can be used with or without a trailing semicolon (the integer
 * convolutions in this file invoke it without one).  Arguments are
 * parenthesized; both are evaluated more than once, so they must not
 * have side effects.
 */
#if IMG_TYPE == 1

/* Test for the presence of any "1" bit in bits
   8 to 31 of val. If present, then val is either
   negative or >255. If over/underflows of 8 bits
   are uncommon, then this technique can be a win,
   since only a single test, rather than two, is
   necessary to determine if clamping is needed.
   On the other hand, if over/underflows are common,
   it adds an extra test.
*/
#define CLAMP_STORE(dst, val)                                   \
  {                                                             \
    if ((val) & 0xffffff00) {                                   \
      if ((val) < MLIB_U8_MIN)                                  \
        (dst) = MLIB_U8_MIN;                                    \
      else                                                      \
        (dst) = MLIB_U8_MAX;                                    \
    } else {                                                    \
      (dst) = (mlib_u8)(val);                                   \
    }                                                           \
  }

#elif IMG_TYPE == 2

#define CLAMP_STORE(dst, val)                                   \
  {                                                             \
    if ((val) >= MLIB_S16_MAX)                                  \
      (dst) = MLIB_S16_MAX;                                     \
    else if ((val) <= MLIB_S16_MIN)                             \
      (dst) = MLIB_S16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_s16)(val);                                  \
  }

#elif IMG_TYPE == 3

#define CLAMP_STORE(dst, val)                                   \
  {                                                             \
    if ((val) >= MLIB_U16_MAX)                                  \
      (dst) = MLIB_U16_MAX;                                     \
    else if ((val) <= MLIB_U16_MIN)                             \
      (dst) = MLIB_U16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_u16)(val);                                  \
  }

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */

249/***************************************************************/
250#define KSIZE 3
251
252mlib_status CONV_FUNC(3x3)(mlib_image *dst,
253 const mlib_image *src,
254 const mlib_s32 *kern,
255 mlib_s32 scalef_expon,
256 mlib_s32 cmask)
257{
258 FTYPE buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
259 DEF_VARS(DTYPE);
260 DTYPE *sl1;
261 mlib_s32 chan2;
262 mlib_s32 *buffo, *buffi;
263 DTYPE *sl2;
264#ifndef __sparc
265 mlib_s32 d0, d1;
266#endif /* __sparc */
267 LOAD_KERNEL3();
268 GET_SRC_DST_PARAMETERS(DTYPE);
269
270 if (wid > BUFF_LINE) {
271 pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE)*wid);
272
273 if (pbuff == NULL) return MLIB_FAILURE;
274 }
275
276 buff0 = pbuff;
277 buff1 = buff0 + wid;
278 buff2 = buff1 + wid;
279 buff3 = buff2 + wid;
280 buffo = (mlib_s32*)(buff3 + wid);
281 buffi = buffo + (wid &~ 1);
282
283 chan1 = nchannel;
284 chan2 = chan1 + chan1;
285
286 wid -= (KSIZE - 1);
287 hgt -= (KSIZE - 1);
288
289 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
290
291 for (c = 0; c < nchannel; c++) {
292 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
293
294 sl = adr_src + c;
295 dl = adr_dst + c;
296
297 sl1 = sl + sll;
298 sl2 = sl1 + sll;
299#ifdef __SUNPRO_C
300#pragma pipeloop(0)
301#endif /* __SUNPRO_C */
302 for (i = 0; i < wid + (KSIZE - 1); i++) {
303 buff0[i] = (FTYPE)sl[i*chan1];
304 buff1[i] = (FTYPE)sl1[i*chan1];
305 buff2[i] = (FTYPE)sl2[i*chan1];
306 }
307
308 sl += KSIZE*sll;
309
310 for (j = 0; j < hgt; j++) {
311 FTYPE s0, s1;
312
313 p02 = buff0[0];
314 p12 = buff1[0];
315 p22 = buff2[0];
316
317 p03 = buff0[1];
318 p13 = buff1[1];
319 p23 = buff2[1];
320
321 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
322 s1 = p03 * k0 + p13 * k3 + p23 * k6;
323
324 sp = sl;
325 dp = dl;
326
327#ifdef __SUNPRO_C
328#pragma pipeloop(0)
329#endif /* __SUNPRO_C */
330 for (i = 0; i <= (wid - 2); i += 2) {
331#ifdef __sparc
332#ifdef _NO_LONGLONG
333 mlib_s32 o64_1, o64_2;
334#else /* _NO_LONGLONG */
335 mlib_s64 o64;
336#endif /* _NO_LONGLONG */
337#endif /* __sparc */
338 d64_2x32 dd;
339
340 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
341 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
342
343 LOAD_BUFF(buffi);
344
345 dd.d64 = *(FTYPE *)(buffi + i);
346 buff3[i ] = (FTYPE)dd.i32s.i0;
347 buff3[i + 1] = (FTYPE)dd.i32s.i1;
348
349#ifndef __sparc
350 d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
351 d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
352
353 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
354 s1 = p03 * k0 + p13 * k3 + p23 * k6;
355
356 dp[0 ] = FROM_S32(d0);
357 dp[chan1] = FROM_S32(d1);
358
359#else /* __sparc */
360
361 dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
362 dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
363 *(FTYPE *)(buffo + i) = dd.d64;
364
365 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
366 s1 = p03 * k0 + p13 * k3 + p23 * k6;
367
368#ifdef _NO_LONGLONG
369
370 o64_1 = buffo[i];
371 o64_2 = buffo[i+1];
372#if IMG_TYPE != 1
373 STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
374#else
375 STORE2(o64_1 >> 24, o64_2 >> 24);
376#endif /* IMG_TYPE != 1 */
377
378#else /* _NO_LONGLONG */
379
380 o64 = *(mlib_s64*)(buffo + i);
381#if IMG_TYPE != 1
382 STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
383#else
384 STORE2(o64 >> 56, o64 >> 24);
385#endif /* IMG_TYPE != 1 */
386#endif /* _NO_LONGLONG */
387#endif /* __sparc */
388
389 sp += chan2;
390 dp += chan2;
391 }
392
393 for (; i < wid; i++) {
394 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i];
395 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
396 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
397
398 buffi[i] = (mlib_s32)sp[0];
399 buff3[i] = (FTYPE)buffi[i];
400
401#ifndef __sparc
402
403 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
404 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
405
406 dp[0] = FROM_S32(d0);
407
408#else /* __sparc */
409
410 buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
411 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
412#if IMG_TYPE != 1
413 dp[0] = FROM_S32(buffo[i]);
414#else
415 dp[0] = buffo[i] >> 24;
416#endif /* IMG_TYPE != 1 */
417#endif /* __sparc */
418
419 sp += chan1;
420 dp += chan1;
421 }
422
423 buffi[wid] = (mlib_s32)sp[0];
424 buff3[wid] = (FTYPE)buffi[wid];
425 buffi[wid + 1] = (mlib_s32)sp[chan1];
426 buff3[wid + 1] = (FTYPE)buffi[wid + 1];
427
428 sl += sll;
429 dl += dll;
430
431 buffT = buff0;
432 buff0 = buff1;
433 buff1 = buff2;
434 buff2 = buff3;
435 buff3 = buffT;
436 }
437 }
438
439#ifdef __sparc
440#if IMG_TYPE == 1
441 {
442 mlib_s32 amask = (1 << nchannel) - 1;
443
444 if ((cmask & amask) != amask) {
445 mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
446 } else {
447 mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
448 }
449 }
450
451#endif /* IMG_TYPE == 1 */
452#endif /* __sparc */
453
454 if (pbuff != buff) mlib_free(pbuff);
455
456 return MLIB_SUCCESS;
457}
458
459/***************************************************************/
460#ifndef __sparc /* for x86, using integer multiplies is faster */
461
462mlib_status CONV_FUNC_I(3x3)(mlib_image *dst,
463 const mlib_image *src,
464 const mlib_s32 *kern,
465 mlib_s32 scalef_expon,
466 mlib_s32 cmask)
467{
468 DTYPE *adr_src, *sl, *sp0, *sp1, *sp2;
469 DTYPE *adr_dst, *dl, *dp;
470 mlib_s32 wid, hgt, sll, dll;
471 mlib_s32 nchannel, chan1, chan2;
472 mlib_s32 i, j, c;
473 mlib_s32 shift1, shift2;
474 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
475 mlib_s32 p02, p03,
476 p12, p13,
477 p22, p23;
478
479#if IMG_TYPE != 1
480 shift1 = 16;
481#else
482 shift1 = 8;
483#endif /* IMG_TYPE != 1 */
484
485 shift2 = scalef_expon - shift1;
486
487 /* keep kernel in regs */
488 k0 = kern[0] >> shift1; k1 = kern[1] >> shift1; k2 = kern[2] >> shift1;
489 k3 = kern[3] >> shift1; k4 = kern[4] >> shift1; k5 = kern[5] >> shift1;
490 k6 = kern[6] >> shift1; k7 = kern[7] >> shift1; k8 = kern[8] >> shift1;
491
492 GET_SRC_DST_PARAMETERS(DTYPE);
493
494 chan1 = nchannel;
495 chan2 = chan1 + chan1;
496
497 wid -= (KSIZE - 1);
498 hgt -= (KSIZE - 1);
499
500 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
501
502 for (c = 0; c < chan1; c++) {
503 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
504
505 sl = adr_src + c;
506 dl = adr_dst + c;
507
508 for (j = 0; j < hgt; j++) {
509 mlib_s32 s0, s1;
510 mlib_s32 pix0, pix1;
511
512 dp = dl;
513 sp0 = sl;
514 sp1 = sp0 + sll;
515 sp2 = sp1 + sll;
516
517 p02 = sp0[0];
518 p12 = sp1[0];
519 p22 = sp2[0];
520
521 p03 = sp0[chan1];
522 p13 = sp1[chan1];
523 p23 = sp2[chan1];
524
525 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
526 s1 = p03 * k0 + p13 * k3 + p23 * k6;
527
528 sp0 += chan2;
529 sp1 += chan2;
530 sp2 += chan2;
531
532#ifdef __SUNPRO_C
533#pragma pipeloop(0)
534#endif /* __SUNPRO_C */
535 for (i = 0; i <= (wid - 2); i += 2) {
536 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
537 p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
538
539 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
540 pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
541 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
542
543 CLAMP_STORE(dp[0], pix0)
544 CLAMP_STORE(dp[chan1], pix1)
545
546 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
547 s1 = p03 * k0 + p13 * k3 + p23 * k6;
548
549 sp0 += chan2;
550 sp1 += chan2;
551 sp2 += chan2;
552 dp += chan2;
553 }
554
555 if (wid & 1) {
556 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
557 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
558 CLAMP_STORE(dp[0], pix0)
559 }
560
561 sl += sll;
562 dl += dll;
563 }
564 }
565
566 return MLIB_SUCCESS;
567}
568
569#endif /* __sparc ( for x86, using integer multiplies is faster ) */
570
571/***************************************************************/
572#undef KSIZE
573#define KSIZE 4
574
575mlib_status CONV_FUNC(4x4)(mlib_image *dst,
576 const mlib_image *src,
577 const mlib_s32 *kern,
578 mlib_s32 scalef_expon,
579 mlib_s32 cmask)
580{
581 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
582 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
583 FTYPE k[KSIZE*KSIZE];
584 mlib_s32 d0, d1;
585 FTYPE k0, k1, k2, k3, k4, k5, k6, k7;
586 FTYPE p00, p01, p02, p03, p04,
587 p10, p11, p12, p13, p14,
588 p20, p21, p22, p23,
589 p30, p31, p32, p33;
590 DEF_VARS(DTYPE);
591 DTYPE *sl1;
592 mlib_s32 chan2;
593 mlib_s32 *buffo, *buffi;
594 DTYPE *sl2, *sl3;
595 LOAD_KERNEL(KSIZE*KSIZE);
596 GET_SRC_DST_PARAMETERS(DTYPE);
597
598 if (wid > BUFF_LINE) {
599 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
600
601 if (pbuff == NULL) return MLIB_FAILURE;
602 }
603
604 buff0 = pbuff;
605 buff1 = buff0 + wid;
606 buff2 = buff1 + wid;
607 buff3 = buff2 + wid;
608 buff4 = buff3 + wid;
609 buffd = buff4 + wid;
610 buffo = (mlib_s32*)(buffd + wid);
611 buffi = buffo + (wid &~ 1);
612
613 chan1 = nchannel;
614 chan2 = chan1 + chan1;
615
616 wid -= (KSIZE - 1);
617 hgt -= (KSIZE - 1);
618
619 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
620
621 for (c = 0; c < nchannel; c++) {
622 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
623
624 sl = adr_src + c;
625 dl = adr_dst + c;
626
627 sl1 = sl + sll;
628 sl2 = sl1 + sll;
629 sl3 = sl2 + sll;
630#ifdef __SUNPRO_C
631#pragma pipeloop(0)
632#endif /* __SUNPRO_C */
633 for (i = 0; i < wid + (KSIZE - 1); i++) {
634 buff0[i] = (FTYPE)sl[i*chan1];
635 buff1[i] = (FTYPE)sl1[i*chan1];
636 buff2[i] = (FTYPE)sl2[i*chan1];
637 buff3[i] = (FTYPE)sl3[i*chan1];
638 }
639
640 sl += KSIZE*sll;
641
642 for (j = 0; j < hgt; j++) {
643 d64_2x32 dd;
644
645 /*
646 * First loop on two first lines of kernel
647 */
648 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
649 k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
650
651 sp = sl;
652 dp = dl;
653
654 p02 = buff0[0];
655 p12 = buff1[0];
656 p03 = buff0[1];
657 p13 = buff1[1];
658 p04 = buff0[2];
659
660#ifdef __SUNPRO_C
661#pragma pipeloop(0)
662#endif /* __SUNPRO_C */
663 for (i = 0; i <= (wid - 2); i += 2) {
664 p00 = p02; p10 = p12;
665 p01 = p03; p11 = p13;
666 p02 = p04; p12 = buff1[i + 2];
667 p03 = buff0[i + 3]; p13 = buff1[i + 3];
668 p04 = buff0[i + 4]; p14 = buff1[i + 4];
669
670 LOAD_BUFF(buffi);
671
672 dd.d64 = *(FTYPE *)(buffi + i);
673 buff4[i ] = (FTYPE)dd.i32s.i0;
674 buff4[i + 1] = (FTYPE)dd.i32s.i1;
675
676 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
677 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
678 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
679 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
680
681 sp += chan2;
682 dp += chan2;
683 }
684
685 /*
686 * Second loop on two last lines of kernel
687 */
688 k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
689 k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
690
691 sp = sl;
692 dp = dl;
693
694 p02 = buff2[0];
695 p12 = buff3[0];
696 p03 = buff2[1];
697 p13 = buff3[1];
698 p04 = buff2[2];
699
700#ifdef __SUNPRO_C
701#pragma pipeloop(0)
702#endif /* __SUNPRO_C */
703 for (i = 0; i <= (wid - 2); i += 2) {
704 p00 = p02; p10 = p12;
705 p01 = p03; p11 = p13;
706 p02 = p04; p12 = buff3[i + 2];
707 p03 = buff2[i + 3]; p13 = buff3[i + 3];
708 p04 = buff2[i + 4]; p14 = buff3[i + 4];
709
710 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
711 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
712 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
713 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
714
715 dp[0 ] = FROM_S32(d0);
716 dp[chan1] = FROM_S32(d1);
717
718 sp += chan2;
719 dp += chan2;
720 }
721
722 /* last pixels */
723 for (; i < wid; i++) {
724 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
725 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
726 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
727 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
728
729 buff4[i] = (FTYPE)sp[0];
730
731 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
732 p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
733 p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
734 p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
735
736 dp[0] = FROM_S32(buffo[i]);
737
738 sp += chan1;
739 dp += chan1;
740 }
741
742 buff4[wid ] = (FTYPE)sp[0];
743 buff4[wid + 1] = (FTYPE)sp[chan1];
744 buff4[wid + 2] = (FTYPE)sp[chan2];
745
746 /* next line */
747 sl += sll;
748 dl += dll;
749
750 buffT = buff0;
751 buff0 = buff1;
752 buff1 = buff2;
753 buff2 = buff3;
754 buff3 = buff4;
755 buff4 = buffT;
756 }
757 }
758
759 if (pbuff != buff) mlib_free(pbuff);
760
761 return MLIB_SUCCESS;
762}
763
764/***************************************************************/
765#undef KSIZE
766#define KSIZE 5
767
768mlib_status CONV_FUNC(5x5)(mlib_image *dst,
769 const mlib_image *src,
770 const mlib_s32 *kern,
771 mlib_s32 scalef_expon,
772 mlib_s32 cmask)
773{
774 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
775 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
776 FTYPE k[KSIZE*KSIZE];
777 mlib_s32 d0, d1;
778 FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
779 FTYPE p00, p01, p02, p03, p04, p05,
780 p10, p11, p12, p13, p14, p15,
781 p20, p21, p22, p23, p24,
782 p30, p31, p32, p33, p34,
783 p40, p41, p42, p43, p44;
784 DEF_VARS(DTYPE);
785 DTYPE *sl1;
786 mlib_s32 chan2;
787 mlib_s32 *buffo, *buffi;
788 DTYPE *sl2, *sl3, *sl4;
789 LOAD_KERNEL(KSIZE*KSIZE);
790 GET_SRC_DST_PARAMETERS(DTYPE);
791
792 if (wid > BUFF_LINE) {
793 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
794
795 if (pbuff == NULL) return MLIB_FAILURE;
796 }
797
798 buff0 = pbuff;
799 buff1 = buff0 + wid;
800 buff2 = buff1 + wid;
801 buff3 = buff2 + wid;
802 buff4 = buff3 + wid;
803 buff5 = buff4 + wid;
804 buffd = buff5 + wid;
805 buffo = (mlib_s32*)(buffd + wid);
806 buffi = buffo + (wid &~ 1);
807
808 chan1 = nchannel;
809 chan2 = chan1 + chan1;
810
811 wid -= (KSIZE - 1);
812 hgt -= (KSIZE - 1);
813
814 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
815
816 for (c = 0; c < nchannel; c++) {
817 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
818
819 sl = adr_src + c;
820 dl = adr_dst + c;
821
822 sl1 = sl + sll;
823 sl2 = sl1 + sll;
824 sl3 = sl2 + sll;
825 sl4 = sl3 + sll;
826#ifdef __SUNPRO_C
827#pragma pipeloop(0)
828#endif /* __SUNPRO_C */
829 for (i = 0; i < wid + (KSIZE - 1); i++) {
830 buff0[i] = (FTYPE)sl[i*chan1];
831 buff1[i] = (FTYPE)sl1[i*chan1];
832 buff2[i] = (FTYPE)sl2[i*chan1];
833 buff3[i] = (FTYPE)sl3[i*chan1];
834 buff4[i] = (FTYPE)sl4[i*chan1];
835 }
836
837 sl += KSIZE*sll;
838
839 for (j = 0; j < hgt; j++) {
840 d64_2x32 dd;
841
842 /*
843 * First loop
844 */
845 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
846 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
847
848 sp = sl;
849 dp = dl;
850
851 p02 = buff0[0];
852 p12 = buff1[0];
853 p03 = buff0[1];
854 p13 = buff1[1];
855 p04 = buff0[2];
856 p14 = buff1[2];
857
858#ifdef __SUNPRO_C
859#pragma pipeloop(0)
860#endif /* __SUNPRO_C */
861 for (i = 0; i <= (wid - 2); i += 2) {
862 p00 = p02; p10 = p12;
863 p01 = p03; p11 = p13;
864 p02 = p04; p12 = p14;
865
866 LOAD_BUFF(buffi);
867
868 p03 = buff0[i + 3]; p13 = buff1[i + 3];
869 p04 = buff0[i + 4]; p14 = buff1[i + 4];
870 p05 = buff0[i + 5]; p15 = buff1[i + 5];
871
872 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
873 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
874 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
875 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
876
877 sp += chan2;
878 dp += chan2;
879 }
880
881 /*
882 * Second loop
883 */
884 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
885 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
886
887 sp = sl;
888 dp = dl;
889
890 p02 = buff2[0];
891 p12 = buff3[0];
892 p03 = buff2[1];
893 p13 = buff3[1];
894 p04 = buff2[2];
895 p14 = buff3[2];
896
897#ifdef __SUNPRO_C
898#pragma pipeloop(0)
899#endif /* __SUNPRO_C */
900 for (i = 0; i <= (wid - 2); i += 2) {
901 p00 = p02; p10 = p12;
902 p01 = p03; p11 = p13;
903
904 p02 = buff2[i + 2]; p12 = buff3[i + 2];
905 p03 = buff2[i + 3]; p13 = buff3[i + 3];
906 p04 = buff2[i + 4]; p14 = buff3[i + 4];
907 p05 = buff2[i + 5]; p15 = buff3[i + 5];
908
909 dd.d64 = *(FTYPE *)(buffi + i);
910 buff5[i ] = (FTYPE)dd.i32s.i0;
911 buff5[i + 1] = (FTYPE)dd.i32s.i1;
912
913 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
914 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
915 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
916 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
917
918 sp += chan2;
919 dp += chan2;
920 }
921
922 /*
923 * 3 loop
924 */
925 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
926
927 sp = sl;
928 dp = dl;
929
930 p02 = buff4[0];
931 p03 = buff4[1];
932 p04 = buff4[2];
933 p05 = buff4[3];
934
935#ifdef __SUNPRO_C
936#pragma pipeloop(0)
937#endif /* __SUNPRO_C */
938 for (i = 0; i <= (wid - 2); i += 2) {
939 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
940
941 p04 = buff4[i + 4]; p05 = buff4[i + 5];
942
943 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
944 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
945
946 dp[0 ] = FROM_S32(d0);
947 dp[chan1] = FROM_S32(d1);
948
949 sp += chan2;
950 dp += chan2;
951 }
952
953 /* last pixels */
954 for (; i < wid; i++) {
955 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
956 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
957 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
958 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
959 p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
960
961 p40 = buff4[i]; p41 = buff4[i + 1]; p42 = buff4[i + 2];
962 p43 = buff4[i + 3]; p44 = buff4[i + 4];
963
964 buff5[i] = (FTYPE)sp[0];
965
966 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
967 p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
968 p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
969 p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
970 p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
971
972 dp[0] = FROM_S32(buffo[i]);
973
974 sp += chan1;
975 dp += chan1;
976 }
977
978 buff5[wid ] = (FTYPE)sp[0];
979 buff5[wid + 1] = (FTYPE)sp[chan1];
980 buff5[wid + 2] = (FTYPE)sp[chan2];
981 buff5[wid + 3] = (FTYPE)sp[chan2 + chan1];
982
983 /* next line */
984 sl += sll;
985 dl += dll;
986
987 buffT = buff0;
988 buff0 = buff1;
989 buff1 = buff2;
990 buff2 = buff3;
991 buff3 = buff4;
992 buff4 = buff5;
993 buff5 = buffT;
994 }
995 }
996
997 if (pbuff != buff) mlib_free(pbuff);
998
999 return MLIB_SUCCESS;
1000}
1001
1002/***************************************************************/
1003#ifndef __sparc /* for x86, using integer multiplies is faster */
1004
1005mlib_status CONV_FUNC_I(5x5)(mlib_image *dst,
1006 const mlib_image *src,
1007 const mlib_s32 *kern,
1008 mlib_s32 scalef_expon,
1009 mlib_s32 cmask)
1010{
1011 mlib_s32 buff[BUFF_LINE];
1012 mlib_s32 *buffd;
1013 mlib_s32 k[KSIZE*KSIZE];
1014 mlib_s32 shift1, shift2;
1015 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
1016 mlib_s32 p00, p01, p02, p03, p04, p05,
1017 p10, p11, p12, p13, p14, p15;
1018 DTYPE *adr_src, *sl, *sp0, *sp1;
1019 DTYPE *adr_dst, *dl, *dp;
1020 mlib_s32 *pbuff = buff;
1021 mlib_s32 wid, hgt, sll, dll;
1022 mlib_s32 nchannel, chan1, chan2, chan3, chan4;
1023 mlib_s32 i, j, c;
1024
1025#if IMG_TYPE != 1
1026 shift1 = 16;
1027#else
1028 shift1 = 8;
1029#endif /* IMG_TYPE != 1 */
1030
1031 shift2 = scalef_expon - shift1;
1032
1033 for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
1034
1035 GET_SRC_DST_PARAMETERS(DTYPE);
1036
1037 if (wid > BUFF_LINE) {
1038 pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
1039
1040 if (pbuff == NULL) return MLIB_FAILURE;
1041 }
1042
1043 buffd = pbuff;
1044
1045 chan1 = nchannel;
1046 chan2 = chan1 + chan1;
1047 chan3 = chan2 + chan1;
1048 chan4 = chan3 + chan1;
1049
1050 wid -= (KSIZE - 1);
1051 hgt -= (KSIZE - 1);
1052
1053 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
1054
1055 for (c = 0; c < chan1; c++) {
1056 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1057
1058 sl = adr_src + c;
1059 dl = adr_dst + c;
1060
1061 for (j = 0; j < hgt; j++) {
1062 mlib_s32 pix0, pix1;
1063 /*
1064 * First loop
1065 */
1066 sp0 = sl;
1067 sp1 = sp0 + sll;
1068 dp = dl;
1069
1070 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
1071 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
1072
1073 p02 = sp0[0]; p12 = sp1[0];
1074 p03 = sp0[chan1]; p13 = sp1[chan1];
1075 p04 = sp0[chan2]; p14 = sp1[chan2];
1076 p05 = sp0[chan3]; p15 = sp1[chan3];
1077
1078 sp0 += chan4;
1079 sp1 += chan4;
1080
1081#ifdef __SUNPRO_C
1082#pragma pipeloop(0)
1083#endif /* __SUNPRO_C */
1084 for (i = 0; i <= (wid - 2); i += 2) {
1085 p00 = p02; p10 = p12;
1086 p01 = p03; p11 = p13;
1087 p02 = p04; p12 = p14;
1088 p03 = p05; p13 = p15;
1089
1090 p04 = sp0[0]; p14 = sp1[0];
1091 p05 = sp0[chan1]; p15 = sp1[chan1];
1092
1093 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1094 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1095 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1096 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1097
1098 sp0 += chan2;
1099 sp1 += chan2;
1100 dp += chan2;
1101 }
1102
1103 if (wid & 1) {
1104 p00 = p02; p10 = p12;
1105 p01 = p03; p11 = p13;
1106 p02 = p04; p12 = p14;
1107 p03 = p05; p13 = p15;
1108
1109 p04 = sp0[0]; p14 = sp1[0];
1110
1111 buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1112 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1113 }
1114
1115 /*
1116 * Second loop
1117 */
1118 sp0 = sl + 2*sll;
1119 sp1 = sp0 + sll;
1120 dp = dl;
1121
1122 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
1123 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
1124
1125 p02 = sp0[0]; p12 = sp1[0];
1126 p03 = sp0[chan1]; p13 = sp1[chan1];
1127 p04 = sp0[chan2]; p14 = sp1[chan2];
1128 p05 = sp0[chan3]; p15 = sp1[chan3];
1129
1130 sp0 += chan4;
1131 sp1 += chan4;
1132
1133#ifdef __SUNPRO_C
1134#pragma pipeloop(0)
1135#endif /* __SUNPRO_C */
1136 for (i = 0; i <= (wid - 2); i += 2) {
1137 p00 = p02; p10 = p12;
1138 p01 = p03; p11 = p13;
1139 p02 = p04; p12 = p14;
1140 p03 = p05; p13 = p15;
1141
1142 p04 = sp0[0]; p14 = sp1[0];
1143 p05 = sp0[chan1]; p15 = sp1[chan1];
1144
1145 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1146 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1147 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1148 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1149
1150 sp0 += chan2;
1151 sp1 += chan2;
1152 dp += chan2;
1153 }
1154
1155 if (wid & 1) {
1156 p00 = p02; p10 = p12;
1157 p01 = p03; p11 = p13;
1158 p02 = p04; p12 = p14;
1159 p03 = p05; p13 = p15;
1160
1161 p04 = sp0[0]; p14 = sp1[0];
1162
1163 buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1164 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1165 }
1166
1167 /*
1168 * 3 loop
1169 */
1170 dp = dl;
1171 sp0 = sl + 4*sll;
1172
1173 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1174
1175 p02 = sp0[0];
1176 p03 = sp0[chan1];
1177 p04 = sp0[chan2];
1178 p05 = sp0[chan3];
1179
1180 sp0 += chan2 + chan2;
1181
1182#ifdef __SUNPRO_C
1183#pragma pipeloop(0)
1184#endif /* __SUNPRO_C */
1185 for (i = 0; i <= (wid - 2); i += 2) {
1186 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1187
1188 p04 = sp0[0]; p05 = sp0[chan1];
1189
1190 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1191 p03 * k3 + p04 * k4) >> shift2;
1192 pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
1193 p04 * k3 + p05 * k4) >> shift2;
1194
1195 CLAMP_STORE(dp[0], pix0)
1196 CLAMP_STORE(dp[chan1], pix1)
1197
1198 dp += chan2;
1199 sp0 += chan2;
1200 }
1201
1202 if (wid & 1) {
1203 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1204
1205 p04 = sp0[0];
1206
1207 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1208 p03 * k3 + p04 * k4) >> shift2;
1209 CLAMP_STORE(dp[0], pix0)
1210 }
1211
1212 /* next line */
1213 sl += sll;
1214 dl += dll;
1215 }
1216 }
1217
1218 if (pbuff != buff) mlib_free(pbuff);
1219
1220 return MLIB_SUCCESS;
1221}
1222
1223#endif /* __sparc ( for x86, using integer multiplies is faster ) */
1224
1225/***************************************************************/
#if IMG_TYPE == 1

#undef  KSIZE
#define KSIZE 7

/*
 * 7x7 convolution of the selected channels, accumulating in FTYPE
 * (CONV_FUNC(7x7) expands to mlib_c_conv7x7nw_u8 when IMG_TYPE is 1).
 *
 *   dst           destination image; the first written pixel is offset by
 *                 (KSIZE - 1)/2 rows and columns, and only
 *                 (wid - 6) x (hgt - 6) pixels are written
 *   src           source image
 *   kern          KSIZE*KSIZE integer kernel, handed to LOAD_KERNEL
 *   scalef_expon  kernel scale exponent, handed to LOAD_KERNEL
 *   cmask         channel mask; bit (nchannel - 1 - c) enables channel c
 *
 * Returns MLIB_FAILURE when the scratch buffer cannot be allocated and
 * MLIB_SUCCESS otherwise.
 *
 * NOTE(review): DEF_VARS, LOAD_KERNEL, GET_SRC_DST_PARAMETERS, LOAD_BUFF and
 * D2I are defined outside this section. The names i, j, c, sp, dp, sl, dl,
 * chan1, k, buff and pbuff are kept because those macros presumably refer to
 * them - confirm before renaming any of them.
 */
mlib_status CONV_FUNC(7x7)(mlib_image       *dst,
                           const mlib_image *src,
                           const mlib_s32   *kern,
                           mlib_s32         scalef_expon,
                           mlib_s32         cmask)
{
  FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
  FTYPE    *ring[2*(KSIZE + 1)];
  FTYPE    *acc;
  FTYPE    k[KSIZE*KSIZE];
  FTYPE    w0, w1, w2, w3, w4, w5, w6;
  FTYPE    s0, s1, s2, s3, s4, s5, s6, s7;
  mlib_s32 row, col, ring_pos;
  mlib_s32 res0, res1;
  DEF_VARS(DTYPE);
  mlib_s32 chan2;
  mlib_s32 *buffo, *buffi;
  LOAD_KERNEL(KSIZE*KSIZE);
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* rows wider than BUFF_LINE do not fit the on-stack scratch area */
  if (wid > BUFF_LINE) {
    pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  /*
   * KSIZE + 1 line buffers, listed twice so that ring + ring_pos always
   * exposes KSIZE + 1 consecutive entries without wrapping the index.
   */
  for (row = 0; row <= KSIZE; row++) {
    ring[row]               = pbuff + row*wid;
    ring[row + (KSIZE + 1)] = ring[row];
  }

  /* the accumulator line and the integer staging area follow the ring */
  acc   = ring[KSIZE] + wid;
  buffo = (mlib_s32*)(acc + wid);
  buffi = buffo + (wid &~ 1);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  wid -= (KSIZE - 1);
  hgt -= (KSIZE - 1);

  adr_dst += ((KSIZE - 1)/2)*(dll + chan1);

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* prime the first KSIZE line buffers with source rows 0 .. KSIZE - 1 */
    for (row = 0; row < KSIZE; row++) {
      FTYPE *line = ring[row];
      DTYPE *from = sl + row*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i < wid + (KSIZE - 1); i++) {
        line[i] = (FTYPE)from[i*chan1];
      }
    }

    ring_pos = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) acc[i] = 0.0;

    /* sl now addresses the source row that enters the window next */
    sl += KSIZE*sll;

    for (j = 0; j < hgt; j++) {
      FTYPE **cur   = ring + ring_pos;
      FTYPE *fresh  = cur[KSIZE];
      FTYPE *coef   = k;

      for (row = 0; row < KSIZE; row++) {
        FTYPE    *line = cur[row];
        d64_2x32 dd;

        sp = sl;
        dp = dl;

        s2 = line[0]; s3 = line[1]; s4 = line[2];
        s5 = line[3]; s6 = line[4]; s7 = line[5];

        w0 = coef[0]; w1 = coef[1]; w2 = coef[2]; w3 = coef[3];
        w4 = coef[4]; w5 = coef[5]; w6 = coef[6];
        coef += KSIZE;

        if (row == KSIZE - 1) {
          /*
           * Last kernel row: finish the sums, store two output pixels per
           * iteration and, in the same pass, load the next source row
           * into the spare line buffer.
           */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i + 1 < wid; i += 2) {
            s0 = s2; s1 = s3; s2 = s4; s3 = s5; s4 = s6; s5 = s7;

            s6 = line[i + 6]; s7 = line[i + 7];

            LOAD_BUFF(buffi);

            dd.d64 = *(FTYPE *)(buffi + i);
            fresh[i    ] = (FTYPE)dd.i32s.i0;
            fresh[i + 1] = (FTYPE)dd.i32s.i1;

            res0 = D2I(s0*w0 + s1*w1 + s2*w2 + s3*w3 + s4*w4 + s5*w5 + s6*w6 + acc[i    ]);
            res1 = D2I(s1*w0 + s2*w1 + s3*w2 + s4*w3 + s5*w4 + s6*w5 + s7*w6 + acc[i + 1]);

            dp[0    ] = FROM_S32(res0);
            dp[chan1] = FROM_S32(res1);

            acc[i    ] = 0.0;
            acc[i + 1] = 0.0;

            sp += chan2;
            dp += chan2;
          }

        } else {
          /* kernel rows 0 .. KSIZE - 2 only add into the accumulator line */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i + 1 < wid; i += 2) {
            s0 = s2; s1 = s3; s2 = s4; s3 = s5; s4 = s6; s5 = s7;

            s6 = line[i + 6]; s7 = line[i + 7];

            acc[i    ] += s0*w0 + s1*w1 + s2*w2 + s3*w3 + s4*w4 + s5*w5 + s6*w6;
            acc[i + 1] += s1*w0 + s2*w1 + s3*w2 + s4*w3 + s5*w4 + s6*w5 + s7*w6;
          }
        }
      }

      /*
       * Odd width: the paired loops above stop one pixel short, so the
       * remaining pixel is computed from scratch over the whole window.
       */
      for (; i < wid; i++) {
        FTYPE *tap = k, sum = 0;

        for (row = 0; row < KSIZE; row++) {
          FTYPE *win = cur[row] + i;

          for (col = 0; col < KSIZE; col++) sum += win[col] * (*tap++);
        }

        res0 = D2I(sum);
        dp[0] = FROM_S32(res0);

        fresh[i] = (FTYPE)sp[0];

        sp += chan1;
        dp += chan1;
      }

      /* the KSIZE - 1 trailing pixels of the incoming row */
      for (col = 0; col < (KSIZE - 1); col++) fresh[wid + col] = sp[col*chan1];

      /* next line */
      sl += sll;
      dl += dll;

      if (++ring_pos > KSIZE) ring_pos = 0;
    }
  }

  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}

#endif /* IMG_TYPE == 1 */
1404
1405/***************************************************************/
1406#define MAX_KER 7
1407#define MAX_N 15
1408
1409static mlib_status mlib_ImageConv1xN(mlib_image *dst,
1410 const mlib_image *src,
1411 const mlib_d64 *k,
1412 mlib_s32 n,
1413 mlib_s32 dn,
1414 mlib_s32 cmask)
1415{
1416 FTYPE buff[BUFF_SIZE];
1417 mlib_s32 off, kh;
1418 mlib_s32 d0, d1;
1419 const FTYPE *pk;
1420 FTYPE k0, k1, k2, k3;
1421 FTYPE p0, p1, p2, p3, p4;
1422 DEF_VARS(DTYPE);
1423 DTYPE *sl_c, *dl_c, *sl0;
1424 mlib_s32 l, hsize, max_hsize;
1425 GET_SRC_DST_PARAMETERS(DTYPE);
1426
1427 hgt -= (n - 1);
1428 adr_dst += dn*dll;
1429
1430 max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
1431
1432 if (!max_hsize) max_hsize = 1;
1433
1434 if (max_hsize > BUFF_SIZE) {
1435 pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
1436 }
1437
1438 chan1 = nchannel;
1439
1440 sl_c = adr_src;
1441 dl_c = adr_dst;
1442
1443 for (l = 0; l < hgt; l += hsize) {
1444 hsize = hgt - l;
1445
1446 if (hsize > max_hsize) hsize = max_hsize;
1447
1448 for (c = 0; c < nchannel; c++) {
1449 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1450
1451 sl = sl_c + c;
1452 dl = dl_c + c;
1453
1454#ifdef __SUNPRO_C
1455#pragma pipeloop(0)
1456#endif /* __SUNPRO_C */
1457 for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
1458
1459 for (i = 0; i < wid; i++) {
1460 sl0 = sl;
1461
1462 for (off = 0; off < (n - 4); off += 4) {
1463 pk = k + off;
1464 sp = sl0;
1465
1466 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1467 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1468 sp += 3*sll;
1469
1470#ifdef __SUNPRO_C
1471#pragma pipeloop(0)
1472#endif /* __SUNPRO_C */
1473 for (j = 0; j < hsize; j += 2) {
1474 p0 = p2; p1 = p3; p2 = p4;
1475 p3 = sp[0];
1476 p4 = sp[sll];
1477
1478 pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1479 pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1480
1481 sp += 2*sll;
1482 }
1483
1484 sl0 += 4*sll;
1485 }
1486
1487 pk = k + off;
1488 sp = sl0;
1489
1490 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1491 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1492
1493 dp = dl;
1494 kh = n - off;
1495
1496 if (kh == 4) {
1497 sp += 3*sll;
1498
1499#ifdef __SUNPRO_C
1500#pragma pipeloop(0)
1501#endif /* __SUNPRO_C */
1502 for (j = 0; j <= (hsize - 2); j += 2) {
1503 p0 = p2; p1 = p3; p2 = p4;
1504 p3 = sp[0];
1505 p4 = sp[sll];
1506
1507 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1508 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);
1509
1510 dp[0 ] = FROM_S32(d0);
1511 dp[dll] = FROM_S32(d1);
1512
1513 pbuff[j] = 0;
1514 pbuff[j + 1] = 0;
1515
1516 sp += 2*sll;
1517 dp += 2*dll;
1518 }
1519
1520 if (j < hsize) {
1521 p0 = p2; p1 = p3; p2 = p4;
1522 p3 = sp[0];
1523
1524 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1525
1526 pbuff[j] = 0;
1527
1528 dp[0] = FROM_S32(d0);
1529 }
1530
1531 } else if (kh == 3) {
1532 sp += 2*sll;
1533
1534#ifdef __SUNPRO_C
1535#pragma pipeloop(0)
1536#endif /* __SUNPRO_C */
1537 for (j = 0; j <= (hsize - 2); j += 2) {
1538 p0 = p2; p1 = p3;
1539 p2 = sp[0];
1540 p3 = sp[sll];
1541
1542 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1543 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);
1544
1545 dp[0 ] = FROM_S32(d0);
1546 dp[dll] = FROM_S32(d1);
1547
1548 pbuff[j] = 0;
1549 pbuff[j + 1] = 0;
1550
1551 sp += 2*sll;
1552 dp += 2*dll;
1553 }
1554
1555 if (j < hsize) {
1556 p0 = p2; p1 = p3;
1557 p2 = sp[0];
1558
1559 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1560
1561 pbuff[j] = 0;
1562
1563 dp[0] = FROM_S32(d0);
1564 }
1565
1566 } else if (kh == 2) {
1567 sp += sll;
1568
1569#ifdef __SUNPRO_C
1570#pragma pipeloop(0)
1571#endif /* __SUNPRO_C */
1572 for (j = 0; j <= (hsize - 2); j += 2) {
1573 p0 = p2;
1574 p1 = sp[0];
1575 p2 = sp[sll];
1576
1577 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1578 d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);
1579
1580 dp[0 ] = FROM_S32(d0);
1581 dp[dll] = FROM_S32(d1);
1582
1583 pbuff[j] = 0;
1584 pbuff[j + 1] = 0;
1585
1586 sp += 2*sll;
1587 dp += 2*dll;
1588 }
1589
1590 if (j < hsize) {
1591 p0 = p2;
1592 p1 = sp[0];
1593
1594 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1595
1596 pbuff[j] = 0;
1597
1598 dp[0] = FROM_S32(d0);
1599 }
1600
1601 } else /* if (kh == 1) */ {
1602#ifdef __SUNPRO_C
1603#pragma pipeloop(0)
1604#endif /* __SUNPRO_C */
1605 for (j = 0; j < hsize; j++) {
1606 p0 = sp[0];
1607
1608 d0 = D2I(p0*k0 + pbuff[j]);
1609
1610 dp[0] = FROM_S32(d0);
1611
1612 pbuff[j] = 0;
1613
1614 sp += sll;
1615 dp += dll;
1616 }
1617 }
1618
1619 sl += chan1;
1620 dl += chan1;
1621 }
1622 }
1623
1624 sl_c += max_hsize*sll;
1625 dl_c += max_hsize*dll;
1626 }
1627
1628 if (pbuff != buff) mlib_free(pbuff);
1629
1630 return MLIB_SUCCESS;
1631}
1632
1633/***************************************************************/
1634mlib_status CONV_FUNC(MxN)(mlib_image *dst,
1635 const mlib_image *src,
1636 const mlib_s32 *kernel,
1637 mlib_s32 m,
1638 mlib_s32 n,
1639 mlib_s32 dm,
1640 mlib_s32 dn,
1641 mlib_s32 scale,
1642 mlib_s32 cmask)
1643{
1644 FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1645 FTYPE **buffs = buffs_arr, *buffd;
1646 FTYPE akernel[256], *k = akernel, fscale = DSCALE;
1647 mlib_s32 mn, l, off, kw, bsize, buff_ind;
1648 mlib_s32 d0, d1;
1649 FTYPE k0, k1, k2, k3, k4, k5, k6;
1650 FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
1651 d64_2x32 dd;
1652 DEF_VARS(DTYPE);
1653 mlib_s32 chan2;
1654 mlib_s32 *buffo, *buffi;
1655 GET_SRC_DST_PARAMETERS(DTYPE);
1656
1657 if (scale > 30) {
1658 fscale *= 1.0/(1 << 30);
1659 scale -= 30;
1660 }
1661
1662 fscale /= (1 << scale);
1663
1664 mn = m*n;
1665
1666 if (mn > 256) {
1667 k = mlib_malloc(mn*sizeof(mlib_d64));
1668
1669 if (k == NULL) return MLIB_FAILURE;
1670 }
1671
1672 for (i = 0; i < mn; i++) {
1673 k[i] = kernel[i]*fscale;
1674 }
1675
1676 if (m == 1) return mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
1677
1678 bsize = (n + 3)*wid;
1679
1680 if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1681 pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
1682
1683 if (pbuff == NULL) return MLIB_FAILURE;
1684 buffs = (FTYPE **)(pbuff + bsize);
1685 }
1686
1687 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
1688 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1689 buffd = buffs[n] + wid;
1690 buffo = (mlib_s32*)(buffd + wid);
1691 buffi = buffo + (wid &~ 1);
1692
1693 chan1 = nchannel;
1694 chan2 = chan1 + chan1;
1695
1696 wid -= (m - 1);
1697 hgt -= (n - 1);
1698 adr_dst += dn*dll + dm*nchannel;
1699
1700 for (c = 0; c < nchannel; c++) {
1701 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1702
1703 sl = adr_src + c;
1704 dl = adr_dst + c;
1705
1706 for (l = 0; l < n; l++) {
1707 FTYPE *buff = buffs[l];
1708
1709#ifdef __SUNPRO_C
1710#pragma pipeloop(0)
1711#endif /* __SUNPRO_C */
1712 for (i = 0; i < wid + (m - 1); i++) {
1713 buff[i] = (FTYPE)sl[i*chan1];
1714 }
1715
1716 sl += sll;
1717 }
1718
1719 buff_ind = 0;
1720
1721#ifdef __SUNPRO_C
1722#pragma pipeloop(0)
1723#endif /* __SUNPRO_C */
1724 for (i = 0; i < wid; i++) buffd[i] = 0.0;
1725
1726 for (j = 0; j < hgt; j++) {
1727 FTYPE **buffc = buffs + buff_ind;
1728 FTYPE *buffn = buffc[n];
1729 FTYPE *pk = k;
1730
1731 for (l = 0; l < n; l++) {
1732 FTYPE *buff_l = buffc[l];
1733
1734 for (off = 0; off < m;) {
1735 FTYPE *buff = buff_l + off;
1736
1737 kw = m - off;
1738
1739 if (kw > 2*MAX_KER) kw = MAX_KER; else
1740 if (kw > MAX_KER) kw = kw/2;
1741 off += kw;
1742
1743 sp = sl;
1744 dp = dl;
1745
1746 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1747 p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1748
1749 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1750 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1751 pk += kw;
1752
1753 if (kw == 7) {
1754
1755 if (l < (n - 1) || off < m) {
1756#ifdef __SUNPRO_C
1757#pragma pipeloop(0)
1758#endif /* __SUNPRO_C */
1759 for (i = 0; i <= (wid - 2); i += 2) {
1760 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1761
1762 p6 = buff[i + 6]; p7 = buff[i + 7];
1763
1764 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1765 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1766 }
1767
1768 } else {
1769#ifdef __SUNPRO_C
1770#pragma pipeloop(0)
1771#endif /* __SUNPRO_C */
1772 for (i = 0; i <= (wid - 2); i += 2) {
1773 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1774
1775 p6 = buff[i + 6]; p7 = buff[i + 7];
1776
1777 LOAD_BUFF(buffi);
1778
1779 dd.d64 = *(FTYPE *)(buffi + i);
1780 buffn[i ] = (FTYPE)dd.i32s.i0;
1781 buffn[i + 1] = (FTYPE)dd.i32s.i1;
1782
1783 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
1784 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1785
1786 dp[0 ] = FROM_S32(d0);
1787 dp[chan1] = FROM_S32(d1);
1788
1789 buffd[i ] = 0.0;
1790 buffd[i + 1] = 0.0;
1791
1792 sp += chan2;
1793 dp += chan2;
1794 }
1795 }
1796
1797 } else if (kw == 6) {
1798
1799 if (l < (n - 1) || off < m) {
1800#ifdef __SUNPRO_C
1801#pragma pipeloop(0)
1802#endif /* __SUNPRO_C */
1803 for (i = 0; i <= (wid - 2); i += 2) {
1804 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1805
1806 p5 = buff[i + 5]; p6 = buff[i + 6];
1807
1808 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1809 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1810 }
1811
1812 } else {
1813#ifdef __SUNPRO_C
1814#pragma pipeloop(0)
1815#endif /* __SUNPRO_C */
1816 for (i = 0; i <= (wid - 2); i += 2) {
1817 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1818
1819 p5 = buff[i + 5]; p6 = buff[i + 6];
1820
1821 buffn[i ] = (FTYPE)sp[0];
1822 buffn[i + 1] = (FTYPE)sp[chan1];
1823
1824 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
1825 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1826
1827 dp[0 ] = FROM_S32(d0);
1828 dp[chan1] = FROM_S32(d1);
1829
1830 buffd[i ] = 0.0;
1831 buffd[i + 1] = 0.0;
1832
1833 sp += chan2;
1834 dp += chan2;
1835 }
1836 }
1837
1838 } else if (kw == 5) {
1839
1840 if (l < (n - 1) || off < m) {
1841#ifdef __SUNPRO_C
1842#pragma pipeloop(0)
1843#endif /* __SUNPRO_C */
1844 for (i = 0; i <= (wid - 2); i += 2) {
1845 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1846
1847 p4 = buff[i + 4]; p5 = buff[i + 5];
1848
1849 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1850 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1851 }
1852
1853 } else {
1854#ifdef __SUNPRO_C
1855#pragma pipeloop(0)
1856#endif /* __SUNPRO_C */
1857 for (i = 0; i <= (wid - 2); i += 2) {
1858 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1859
1860 p4 = buff[i + 4]; p5 = buff[i + 5];
1861
1862 buffn[i ] = (FTYPE)sp[0];
1863 buffn[i + 1] = (FTYPE)sp[chan1];
1864
1865 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
1866 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1867
1868 dp[0 ] = FROM_S32(d0);
1869 dp[chan1] = FROM_S32(d1);
1870
1871 buffd[i ] = 0.0;
1872 buffd[i + 1] = 0.0;
1873
1874 sp += chan2;
1875 dp += chan2;
1876 }
1877 }
1878
1879 } else if (kw == 4) {
1880
1881 if (l < (n - 1) || off < m) {
1882#ifdef __SUNPRO_C
1883#pragma pipeloop(0)
1884#endif /* __SUNPRO_C */
1885 for (i = 0; i <= (wid - 2); i += 2) {
1886 p0 = p2; p1 = p3; p2 = p4;
1887
1888 p3 = buff[i + 3]; p4 = buff[i + 4];
1889
1890 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1891 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1892 }
1893
1894 } else {
1895#ifdef __SUNPRO_C
1896#pragma pipeloop(0)
1897#endif /* __SUNPRO_C */
1898 for (i = 0; i <= (wid - 2); i += 2) {
1899 p0 = p2; p1 = p3; p2 = p4;
1900
1901 p3 = buff[i + 3]; p4 = buff[i + 4];
1902
1903 buffn[i ] = (FTYPE)sp[0];
1904 buffn[i + 1] = (FTYPE)sp[chan1];
1905
1906 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
1907 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1908
1909 dp[0 ] = FROM_S32(d0);
1910 dp[chan1] = FROM_S32(d1);
1911
1912 buffd[i ] = 0.0;
1913 buffd[i + 1] = 0.0;
1914
1915 sp += chan2;
1916 dp += chan2;
1917 }
1918 }
1919
1920 } else if (kw == 3) {
1921
1922 if (l < (n - 1) || off < m) {
1923#ifdef __SUNPRO_C
1924#pragma pipeloop(0)
1925#endif /* __SUNPRO_C */
1926 for (i = 0; i <= (wid - 2); i += 2) {
1927 p0 = p2; p1 = p3;
1928
1929 p2 = buff[i + 2]; p3 = buff[i + 3];
1930
1931 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
1932 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1933 }
1934
1935 } else {
1936#ifdef __SUNPRO_C
1937#pragma pipeloop(0)
1938#endif /* __SUNPRO_C */
1939 for (i = 0; i <= (wid - 2); i += 2) {
1940 p0 = p2; p1 = p3;
1941
1942 p2 = buff[i + 2]; p3 = buff[i + 3];
1943
1944 buffn[i ] = (FTYPE)sp[0];
1945 buffn[i + 1] = (FTYPE)sp[chan1];
1946
1947 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
1948 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1949
1950 dp[0 ] = FROM_S32(d0);
1951 dp[chan1] = FROM_S32(d1);
1952
1953 buffd[i ] = 0.0;
1954 buffd[i + 1] = 0.0;
1955
1956 sp += chan2;
1957 dp += chan2;
1958 }
1959 }
1960
1961 } else /*if (kw == 2)*/ {
1962
1963 if (l < (n - 1) || off < m) {
1964#ifdef __SUNPRO_C
1965#pragma pipeloop(0)
1966#endif /* __SUNPRO_C */
1967 for (i = 0; i <= (wid - 2); i += 2) {
1968 p0 = p2;
1969
1970 p1 = buff[i + 1]; p2 = buff[i + 2];
1971
1972 buffd[i ] += p0*k0 + p1*k1;
1973 buffd[i + 1] += p1*k0 + p2*k1;
1974 }
1975
1976 } else {
1977#ifdef __SUNPRO_C
1978#pragma pipeloop(0)
1979#endif /* __SUNPRO_C */
1980 for (i = 0; i <= (wid - 2); i += 2) {
1981 p0 = p2;
1982
1983 p1 = buff[i + 1]; p2 = buff[i + 2];
1984
1985 buffn[i ] = (FTYPE)sp[0];
1986 buffn[i + 1] = (FTYPE)sp[chan1];
1987
1988 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]);
1989 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
1990
1991 dp[0 ] = FROM_S32(d0);
1992 dp[chan1] = FROM_S32(d1);
1993
1994 buffd[i ] = 0.0;
1995 buffd[i + 1] = 0.0;
1996
1997 sp += chan2;
1998 dp += chan2;
1999 }
2000 }
2001 }
2002 }
2003 }
2004
2005 /* last pixels */
2006 for (; i < wid; i++) {
2007 FTYPE *pk = k, s = 0;
2008 mlib_s32 x, d0;
2009
2010 for (l = 0; l < n; l++) {
2011 FTYPE *buff = buffc[l] + i;
2012
2013 for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2014 }
2015
2016 d0 = D2I(s);
2017 dp[0] = FROM_S32(d0);
2018
2019 buffn[i] = (FTYPE)sp[0];
2020
2021 sp += chan1;
2022 dp += chan1;
2023 }
2024
2025 for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
2026
2027 /* next line */
2028 sl += sll;
2029 dl += dll;
2030
2031 buff_ind++;
2032
2033 if (buff_ind >= n + 1) buff_ind = 0;
2034 }
2035 }
2036
2037 if (pbuff != buff) mlib_free(pbuff);
2038
2039 return MLIB_SUCCESS;
2040}
2041
2042/***************************************************************/
2043#ifndef __sparc /* for x86, using integer multiplies is faster */
2044
2045#define STORE_RES(res, x) \
2046 x >>= shift2; \
2047 CLAMP_STORE(res, x)
2048
2049mlib_status CONV_FUNC_I(MxN)(mlib_image *dst,
2050 const mlib_image *src,
2051 const mlib_s32 *kernel,
2052 mlib_s32 m,
2053 mlib_s32 n,
2054 mlib_s32 dm,
2055 mlib_s32 dn,
2056 mlib_s32 scale,
2057 mlib_s32 cmask)
2058{
2059 mlib_s32 buff[BUFF_SIZE], *buffd = buff;
2060 mlib_s32 l, off, kw;
2061 mlib_s32 d0, d1, shift1, shift2;
2062 mlib_s32 k0, k1, k2, k3, k4, k5, k6;
2063 mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
2064 DTYPE *adr_src, *sl, *sp;
2065 DTYPE *adr_dst, *dl, *dp;
2066 mlib_s32 wid, hgt, sll, dll;
2067 mlib_s32 nchannel, chan1;
2068 mlib_s32 i, j, c;
2069 mlib_s32 chan2;
2070 mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
2071 GET_SRC_DST_PARAMETERS(DTYPE);
2072
2073#if IMG_TYPE != 1
2074 shift1 = 16;
2075#else
2076 shift1 = 8;
2077#endif /* IMG_TYPE != 1 */
2078 shift2 = scale - shift1;
2079
2080 chan1 = nchannel;
2081 chan2 = chan1 + chan1;
2082
2083 wid -= (m - 1);
2084 hgt -= (n - 1);
2085 adr_dst += dn*dll + dm*nchannel;
2086
2087 if (wid > BUFF_SIZE) {
2088 buffd = mlib_malloc(sizeof(mlib_s32)*wid);
2089
2090 if (buffd == NULL) return MLIB_FAILURE;
2091 }
2092
2093 if (m*n > MAX_N*MAX_N) {
2094 k = mlib_malloc(sizeof(mlib_s32)*(m*n));
2095
2096 if (k == NULL) {
2097 if (buffd != buff) mlib_free(buffd);
2098 return MLIB_FAILURE;
2099 }
2100 }
2101
2102 for (i = 0; i < m*n; i++) {
2103 k[i] = kernel[i] >> shift1;
2104 }
2105
2106 for (c = 0; c < nchannel; c++) {
2107 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
2108
2109 sl = adr_src + c;
2110 dl = adr_dst + c;
2111
2112#ifdef __SUNPRO_C
2113#pragma pipeloop(0)
2114#endif /* __SUNPRO_C */
2115 for (i = 0; i < wid; i++) buffd[i] = 0;
2116
2117 for (j = 0; j < hgt; j++) {
2118 mlib_s32 *pk = k;
2119
2120 for (l = 0; l < n; l++) {
2121 DTYPE *sp0 = sl + l*sll;
2122
2123 for (off = 0; off < m;) {
2124 sp = sp0 + off*chan1;
2125 dp = dl;
2126
2127 kw = m - off;
2128
2129 if (kw > 2*MAX_KER) kw = MAX_KER; else
2130 if (kw > MAX_KER) kw = kw/2;
2131 off += kw;
2132
2133 p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
2134 p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];
2135
2136 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2137 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
2138 pk += kw;
2139
2140 sp += (kw - 1)*chan1;
2141
2142 if (kw == 7) {
2143
2144 if (l < (n - 1) || off < m) {
2145#ifdef __SUNPRO_C
2146#pragma pipeloop(0)
2147#endif /* __SUNPRO_C */
2148 for (i = 0; i <= (wid - 2); i += 2) {
2149 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2150 p6 = sp[0];
2151 p7 = sp[chan1];
2152
2153 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2154 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2155
2156 sp += chan2;
2157 }
2158
2159 } else {
2160#ifdef __SUNPRO_C
2161#pragma pipeloop(0)
2162#endif /* __SUNPRO_C */
2163 for (i = 0; i <= (wid - 2); i += 2) {
2164 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2165 p6 = sp[0];
2166 p7 = sp[chan1];
2167
2168 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
2169 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2170
2171 STORE_RES(dp[0 ], d0);
2172 STORE_RES(dp[chan1], d1);
2173
2174 buffd[i ] = 0;
2175 buffd[i + 1] = 0;
2176
2177 sp += chan2;
2178 dp += chan2;
2179 }
2180 }
2181
2182 } else if (kw == 6) {
2183
2184 if (l < (n - 1) || off < m) {
2185#ifdef __SUNPRO_C
2186#pragma pipeloop(0)
2187#endif /* __SUNPRO_C */
2188 for (i = 0; i <= (wid - 2); i += 2) {
2189 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2190 p5 = sp[0];
2191 p6 = sp[chan1];
2192
2193 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2194 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2195
2196 sp += chan2;
2197 }
2198
2199 } else {
2200#ifdef __SUNPRO_C
2201#pragma pipeloop(0)
2202#endif /* __SUNPRO_C */
2203 for (i = 0; i <= (wid - 2); i += 2) {
2204 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2205 p5 = sp[0];
2206 p6 = sp[chan1];
2207
2208 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
2209 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2210
2211 STORE_RES(dp[0 ], d0);
2212 STORE_RES(dp[chan1], d1);
2213
2214 buffd[i ] = 0;
2215 buffd[i + 1] = 0;
2216
2217 sp += chan2;
2218 dp += chan2;
2219 }
2220 }
2221
2222 } else if (kw == 5) {
2223
2224 if (l < (n - 1) || off < m) {
2225#ifdef __SUNPRO_C
2226#pragma pipeloop(0)
2227#endif /* __SUNPRO_C */
2228 for (i = 0; i <= (wid - 2); i += 2) {
2229 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2230 p4 = sp[0];
2231 p5 = sp[chan1];
2232
2233 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2234 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2235
2236 sp += chan2;
2237 }
2238
2239 } else {
2240#ifdef __SUNPRO_C
2241#pragma pipeloop(0)
2242#endif /* __SUNPRO_C */
2243 for (i = 0; i <= (wid - 2); i += 2) {
2244 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2245 p4 = sp[0];
2246 p5 = sp[chan1];
2247
2248 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
2249 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2250
2251 STORE_RES(dp[0 ], d0);
2252 STORE_RES(dp[chan1], d1);
2253
2254 buffd[i ] = 0;
2255 buffd[i + 1] = 0;
2256
2257 sp += chan2;
2258 dp += chan2;
2259 }
2260 }
2261
2262 } else if (kw == 4) {
2263
2264 if (l < (n - 1) || off < m) {
2265#ifdef __SUNPRO_C
2266#pragma pipeloop(0)
2267#endif /* __SUNPRO_C */
2268 for (i = 0; i <= (wid - 2); i += 2) {
2269 p0 = p2; p1 = p3; p2 = p4;
2270 p3 = sp[0];
2271 p4 = sp[chan1];
2272
2273 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2274 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2275
2276 sp += chan2;
2277 }
2278
2279 } else {
2280#ifdef __SUNPRO_C
2281#pragma pipeloop(0)
2282#endif /* __SUNPRO_C */
2283 for (i = 0; i <= (wid - 2); i += 2) {
2284 p0 = p2; p1 = p3; p2 = p4;
2285 p3 = sp[0];
2286 p4 = sp[chan1];
2287
2288 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
2289 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2290
2291 STORE_RES(dp[0 ], d0);
2292 STORE_RES(dp[chan1], d1);
2293
2294 buffd[i ] = 0;
2295 buffd[i + 1] = 0;
2296
2297 sp += chan2;
2298 dp += chan2;
2299 }
2300 }
2301
2302 } else if (kw == 3) {
2303
2304 if (l < (n - 1) || off < m) {
2305#ifdef __SUNPRO_C
2306#pragma pipeloop(0)
2307#endif /* __SUNPRO_C */
2308 for (i = 0; i <= (wid - 2); i += 2) {
2309 p0 = p2; p1 = p3;
2310 p2 = sp[0];
2311 p3 = sp[chan1];
2312
2313 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
2314 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2315
2316 sp += chan2;
2317 }
2318
2319 } else {
2320#ifdef __SUNPRO_C
2321#pragma pipeloop(0)
2322#endif /* __SUNPRO_C */
2323 for (i = 0; i <= (wid - 2); i += 2) {
2324 p0 = p2; p1 = p3;
2325 p2 = sp[0];
2326 p3 = sp[chan1];
2327
2328 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
2329 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2330
2331 STORE_RES(dp[0 ], d0);
2332 STORE_RES(dp[chan1], d1);
2333
2334 buffd[i ] = 0;
2335 buffd[i + 1] = 0;
2336
2337 sp += chan2;
2338 dp += chan2;
2339 }
2340 }
2341
2342 } else if (kw == 2) {
2343
2344 if (l < (n - 1) || off < m) {
2345#ifdef __SUNPRO_C
2346#pragma pipeloop(0)
2347#endif /* __SUNPRO_C */
2348 for (i = 0; i <= (wid - 2); i += 2) {
2349 p0 = p2;
2350 p1 = sp[0];
2351 p2 = sp[chan1];
2352
2353 buffd[i ] += p0*k0 + p1*k1;
2354 buffd[i + 1] += p1*k0 + p2*k1;
2355
2356 sp += chan2;
2357 }
2358
2359 } else {
2360#ifdef __SUNPRO_C
2361#pragma pipeloop(0)
2362#endif /* __SUNPRO_C */
2363 for (i = 0; i <= (wid - 2); i += 2) {
2364 p0 = p2;
2365 p1 = sp[0];
2366 p2 = sp[chan1];
2367
2368 d0 = (p0*k0 + p1*k1 + buffd[i ]);
2369 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
2370
2371 STORE_RES(dp[0 ], d0);
2372 STORE_RES(dp[chan1], d1);
2373
2374 buffd[i ] = 0;
2375 buffd[i + 1] = 0;
2376
2377 sp += chan2;
2378 dp += chan2;
2379 }
2380 }
2381
2382 } else /*if (kw == 1)*/ {
2383
2384 if (l < (n - 1) || off < m) {
2385#ifdef __SUNPRO_C
2386#pragma pipeloop(0)
2387#endif /* __SUNPRO_C */
2388 for (i = 0; i <= (wid - 2); i += 2) {
2389 p0 = sp[0];
2390 p1 = sp[chan1];
2391
2392 buffd[i ] += p0*k0;
2393 buffd[i + 1] += p1*k0;
2394
2395 sp += chan2;
2396 }
2397
2398 } else {
2399#ifdef __SUNPRO_C
2400#pragma pipeloop(0)
2401#endif /* __SUNPRO_C */
2402 for (i = 0; i <= (wid - 2); i += 2) {
2403 p0 = sp[0];
2404 p1 = sp[chan1];
2405
2406 d0 = (p0*k0 + buffd[i ]);
2407 d1 = (p1*k0 + buffd[i + 1]);
2408
2409 STORE_RES(dp[0 ], d0);
2410 STORE_RES(dp[chan1], d1);
2411
2412 buffd[i ] = 0;
2413 buffd[i + 1] = 0;
2414
2415 sp += chan2;
2416 dp += chan2;
2417 }
2418 }
2419 }
2420 }
2421 }
2422
2423 /* last pixels */
2424 for (; i < wid; i++) {
2425 mlib_s32 *pk = k, s = 0;
2426 mlib_s32 x;
2427
2428 for (l = 0; l < n; l++) {
2429 sp = sl + l*sll + i*chan1;
2430
2431 for (x = 0; x < m; x++) {
2432 s += sp[0] * pk[0];
2433 sp += chan1;
2434 pk ++;
2435 }
2436 }
2437
2438 STORE_RES(dp[0], s);
2439
2440 sp += chan1;
2441 dp += chan1;
2442 }
2443
2444 sl += sll;
2445 dl += dll;
2446 }
2447 }
2448
2449 if (buffd != buff) mlib_free(buffd);
2450 if (k != k_locl) mlib_free(k);
2451
2452 return MLIB_SUCCESS;
2453}
2454
2455/***************************************************************/
2456#endif /* __sparc ( for x86, using integer multiplies is faster ) */
2457
2458/***************************************************************/