/*
 * Copyright 2000-2003 Sun Microsystems, Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */


/*
 * FUNCTION
 *      Internal functions for mlib_ImageConv* on U8/S16/U16 types and
 *      MLIB_EDGE_DST_NO_WRITE mask
 */
32
33#include "mlib_image.h"
34#include "mlib_c_ImageConv.h"
35
/*
 * IMG_TYPE selects the pixel type this translation unit is built for:
 *   1 - mlib_u8, 2 - mlib_s16, 3 - mlib_u16.
 *
 * Each branch below supplies the same set of macros:
 *   DTYPE          pixel type of the source and destination images
 *   CONV_FUNC(K)   name of the floating-point convolution for kernel K
 *   CONV_FUNC_I(K) name of the integer convolution for kernel K
 *   DSCALE         initial value of the kernel scale factor
 *   FROM_S32(x)    turns a 32-bit result into a pixel value (takes the
 *                  upper bits; the unsigned types also flip the top bit)
 *   S64TOS32(x)    masks a 64-bit value for packing into the low half of
 *                  a 64-bit word (only masks for the signed type)
 *   SAT_OFF        text appended to the argument of D2I(); empty for the
 *                  signed type, "- 2^31" for the unsigned types
 */
#define IMG_TYPE 2

/***************************************************************/
#if IMG_TYPE == 1

#define DTYPE             mlib_u8
#define CONV_FUNC(KERN)   mlib_c_conv##KERN##nw_u8
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8
#define DSCALE            (1 << 24)
#define FROM_S32(x)       (((x) >> 24) ^ 128)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#elif IMG_TYPE == 2

#define DTYPE             mlib_s16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_s16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16
#define DSCALE            65536.0
#define FROM_S32(x)       ((x) >> 16)
#define S64TOS32(x)       ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

#define DTYPE             mlib_u16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_u16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16
#define DSCALE            65536.0
#define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#endif /* IMG_TYPE == 1 */
73
74/***************************************************************/
/* Size constants (not referenced by the functions visible in this part of the file). */
#define BUFF_SIZE   1600

#define CACHE_SIZE  (64*1024)

/***************************************************************/
/* Floating-point type used for the row buffers and the accumulation. */
#define FTYPE mlib_d64

/*
 * CLAMP_S32(x): convert x to mlib_s32.
 * Unless MLIB_USE_FTOI_CLAMPING is defined, values at or below
 * MLIB_S32_MIN / at or above MLIB_S32_MAX are replaced by that limit
 * before the cast.  Note that x is evaluated more than once.
 */
#ifndef MLIB_USE_FTOI_CLAMPING

#define CLAMP_S32(x)                                  \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN :             \
   (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/* D2I(x): apply the per-type SAT_OFF suffix to x, then clamp to mlib_s32. */
#define D2I(x) CLAMP_S32((x) SAT_OFF)
95
96/***************************************************************/
/*
 * STORE2(res0, res1): write two results through the caller's `dp`
 * pointer, at dp[0] and dp[chan1].  On little-endian builds the two
 * arguments are swapped (res1 goes to dp[0]).
 *
 * The two stores are wrapped in do { } while (0) so the macro behaves
 * as a single statement (safe under an unbraced if/else/for), and the
 * arguments are parenthesized.  Use it with a trailing semicolon.
 */
#ifdef _LITTLE_ENDIAN

#define STORE2(res0, res1)  \
  do {                      \
    dp[0    ] = (res1);     \
    dp[chan1] = (res0);     \
  } while (0)

#else

#define STORE2(res0, res1)  \
  do {                      \
    dp[0    ] = (res0);     \
    dp[chan1] = (res1);     \
  } while (0)

#endif /* _LITTLE_ENDIAN */
110
111/***************************************************************/
/*
 * LOAD_BUFF(buff): copy the two source pixels sp[0] and sp[chan1] into
 * buff[i] and buff[i + 1] (uses the caller's `sp`, `chan1` and `i`).
 *
 * Without 64-bit integers the two elements are stored separately; the
 * stores are wrapped in do { } while (0) so the macro is a single
 * statement.  Otherwise both pixels are packed into one 64-bit store
 * at (buff + i), with the half order chosen per endianness so that
 * sp[0] ends up in the element at the lower address.
 * The `buff` argument is parenthesized in every variant.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)           \
  do {                            \
    (buff)[i    ] = sp[0];        \
    (buff)[i + 1] = sp[chan1];    \
  } while (0)

#else /* _NO_LONGLONG */

#ifdef _LITTLE_ENDIAN

#define LOAD_BUFF(buff) \
  *(mlib_s64*)((buff) + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])

#else /* _LITTLE_ENDIAN */

#define LOAD_BUFF(buff) \
  *(mlib_s64*)((buff) + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])

#endif /* _LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
132
133/***************************************************************/
134typedef union {
135 mlib_d64 d64;
136 struct {
137 mlib_s32 i0;
138 mlib_s32 i1;
139 } i32s;
140 struct {
141 mlib_s32 f0;
142 mlib_s32 f1;
143 } f32s;
144} d64_2x32;
145
146/***************************************************************/
/*
 * Image width up to which the on-stack row buffers are large enough;
 * wider images make the convolution functions allocate from the heap.
 */
#define BUFF_LINE 256

/***************************************************************/
/*
 * DEF_VARS(type): declare the locals shared by the floating-point
 * convolution functions.  `type` is the pixel type.  Expects an array
 * named `buff` to be declared already (pbuff is initialized to it).
 * The expansion has no trailing semicolon; the caller supplies it.
 */
#define DEF_VARS(type)                          \
  type     *adr_src, *sl, *sp;                  \
  type     *adr_dst, *dl, *dp;                  \
  FTYPE    *pbuff = buff;                       \
  mlib_s32 wid, hgt;                            \
  mlib_s32 sll, dll;                            \
  mlib_s32 nchannel, chan1;                     \
  mlib_s32 i, j, c
157
158/***************************************************************/
/*
 * LOAD_KERNEL3(): declare and fill the nine scaled coefficients
 * k0..k8 of a 3x3 kernel, and declare the pixel temporaries pXY.
 *
 * The scale factor starts at DSCALE and is divided by 2^scalef_expon.
 * The division is done in steps of 2^30 so that each integer shift
 * uses a count of at most 30.  The caller's `scalef_expon` is modified
 * and `kern` is read.  The expansion mixes declarations and statements
 * and has no trailing semicolon.
 */
#define LOAD_KERNEL3()                                          \
  FTYPE scalef = DSCALE;                                        \
  FTYPE k0, k1, k2,                                             \
        k3, k4, k5,                                             \
        k6, k7, k8;                                             \
  FTYPE p00, p01, p02, p03,                                     \
        p10, p11, p12, p13,                                     \
        p20, p21, p22, p23;                                     \
                                                                \
  for (; scalef_expon > 30; scalef_expon -= 30) {               \
    scalef /= (1 << 30);                                        \
  }                                                             \
                                                                \
  scalef /= (1 << scalef_expon);                                \
                                                                \
  /* keep kernel in regs */                                     \
  k0 = scalef * kern[0];                                        \
  k1 = scalef * kern[1];                                        \
  k2 = scalef * kern[2];                                        \
  k3 = scalef * kern[3];                                        \
  k4 = scalef * kern[4];                                        \
  k5 = scalef * kern[5];                                        \
  k6 = scalef * kern[6];                                        \
  k7 = scalef * kern[7];                                        \
  k8 = scalef * kern[8]
177
178/***************************************************************/
/*
 * LOAD_KERNEL(SIZE): fill the caller's array k[0..SIZE-1] with the
 * kernel coefficients kern[] multiplied by DSCALE / 2^scalef_expon.
 *
 * The division by 2^scalef_expon is done in steps of 2^30 so that each
 * integer shift uses a count of at most 30.  Declares `scalef`,
 * modifies the caller's `scalef_expon` and uses the caller's `j` as
 * the loop index.  The expansion has no trailing semicolon.
 */
#define LOAD_KERNEL(SIZE)                                       \
  FTYPE scalef = DSCALE;                                        \
                                                                \
  for (; scalef_expon > 30; scalef_expon -= 30) {               \
    scalef /= (1 << 30);                                        \
  }                                                             \
                                                                \
  scalef /= (1 << scalef_expon);                                \
                                                                \
  for (j = 0; j < (SIZE); j++) k[j] = scalef * kern[j]
190
191/***************************************************************/
/*
 * GET_SRC_DST_PARAMETERS(type): read the image geometry into the
 * caller's locals.  Width, height and channel count are taken from
 * `src`; the strides returned by mlib_ImageGetStride() are divided by
 * sizeof(type), so sll/dll count elements of `type`; the data pointers
 * are cast to `type *`.  No trailing semicolon in the expansion.
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  wid      = mlib_ImageGetWidth(src);                           \
  hgt      = mlib_ImageGetHeight(src);                          \
  nchannel = mlib_ImageGetChannels(src);                        \
  adr_src  = (type *)mlib_ImageGetData(src);                    \
  adr_dst  = (type *)mlib_ImageGetData(dst);                    \
  sll      = mlib_ImageGetStride(src) / sizeof(type);           \
  dll      = mlib_ImageGetStride(dst) / sizeof(type)
200
201/***************************************************************/
#ifndef __sparc

/*
 * CLAMP_STORE(dst, val): store the integer `val` into the pixel
 * lvalue `dst`, saturating to the range of the pixel type.
 *
 * Every variant is wrapped in do { } while (0) so that it expands to
 * exactly one statement: the previous bare if/else chains could
 * capture a following `else` or split under an unbraced `if`.
 * Arguments are parenthesized.  `val` is evaluated more than once, so
 * it must not have side effects.  Use with a trailing semicolon.
 */
#if IMG_TYPE == 1

/* Test for the presence of any "1" bit in bits
   8 to 31 of val. If present, then val is either
   negative or >255. If over/underflows of 8 bits
   are uncommon, then this technique can be a win,
   since only a single test, rather than two, is
   necessary to determine if clamping is needed.
   On the other hand, if over/underflows are common,
   it adds an extra test.
*/
#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) & 0xffffff00) {                                   \
      if ((val) < MLIB_U8_MIN)                                  \
        (dst) = MLIB_U8_MIN;                                    \
      else                                                      \
        (dst) = MLIB_U8_MAX;                                    \
    } else {                                                    \
      (dst) = (mlib_u8)(val);                                   \
    }                                                           \
  } while (0)

#elif IMG_TYPE == 2

#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) >= MLIB_S16_MAX)                                  \
      (dst) = MLIB_S16_MAX;                                     \
    else if ((val) <= MLIB_S16_MIN)                             \
      (dst) = MLIB_S16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_s16)(val);                                  \
  } while (0)

#elif IMG_TYPE == 3

#define CLAMP_STORE(dst, val)                                   \
  do {                                                          \
    if ((val) >= MLIB_U16_MAX)                                  \
      (dst) = MLIB_U16_MAX;                                     \
    else if ((val) <= MLIB_U16_MIN)                             \
      (dst) = MLIB_U16_MIN;                                     \
    else                                                        \
      (dst) = (mlib_u16)(val);                                  \
  } while (0)

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */
247
248/***************************************************************/
249#define KSIZE 3
250
251mlib_status CONV_FUNC(3x3)(mlib_image *dst,
252 const mlib_image *src,
253 const mlib_s32 *kern,
254 mlib_s32 scalef_expon,
255 mlib_s32 cmask)
256{
257 FTYPE buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
258 DEF_VARS(DTYPE);
259 DTYPE *sl1;
260 mlib_s32 chan2;
261 mlib_s32 *buffo, *buffi;
262 DTYPE *sl2;
263#ifndef __sparc
264 mlib_s32 d0, d1;
265#endif /* __sparc */
266 LOAD_KERNEL3();
267 GET_SRC_DST_PARAMETERS(DTYPE);
268
269 if (wid > BUFF_LINE) {
270 pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE)*wid);
271
272 if (pbuff == NULL) return MLIB_FAILURE;
273 }
274
275 buff0 = pbuff;
276 buff1 = buff0 + wid;
277 buff2 = buff1 + wid;
278 buff3 = buff2 + wid;
279 buffo = (mlib_s32*)(buff3 + wid);
280 buffi = buffo + (wid &~ 1);
281
282 chan1 = nchannel;
283 chan2 = chan1 + chan1;
284
285 wid -= (KSIZE - 1);
286 hgt -= (KSIZE - 1);
287
288 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
289
290 for (c = 0; c < nchannel; c++) {
291 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
292
293 sl = adr_src + c;
294 dl = adr_dst + c;
295
296 sl1 = sl + sll;
297 sl2 = sl1 + sll;
298#ifdef __SUNPRO_C
299#pragma pipeloop(0)
300#endif /* __SUNPRO_C */
301 for (i = 0; i < wid + (KSIZE - 1); i++) {
302 buff0[i] = (FTYPE)sl[i*chan1];
303 buff1[i] = (FTYPE)sl1[i*chan1];
304 buff2[i] = (FTYPE)sl2[i*chan1];
305 }
306
307 sl += KSIZE*sll;
308
309 for (j = 0; j < hgt; j++) {
310 FTYPE s0, s1;
311
312 p02 = buff0[0];
313 p12 = buff1[0];
314 p22 = buff2[0];
315
316 p03 = buff0[1];
317 p13 = buff1[1];
318 p23 = buff2[1];
319
320 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
321 s1 = p03 * k0 + p13 * k3 + p23 * k6;
322
323 sp = sl;
324 dp = dl;
325
326#ifdef __SUNPRO_C
327#pragma pipeloop(0)
328#endif /* __SUNPRO_C */
329 for (i = 0; i <= (wid - 2); i += 2) {
330#ifdef __sparc
331#ifdef _NO_LONGLONG
332 mlib_s32 o64_1, o64_2;
333#else /* _NO_LONGLONG */
334 mlib_s64 o64;
335#endif /* _NO_LONGLONG */
336#endif /* __sparc */
337 d64_2x32 dd;
338
339 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
340 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
341
342 LOAD_BUFF(buffi);
343
344 dd.d64 = *(FTYPE *)(buffi + i);
345 buff3[i ] = (FTYPE)dd.i32s.i0;
346 buff3[i + 1] = (FTYPE)dd.i32s.i1;
347
348#ifndef __sparc
349 d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
350 d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
351
352 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
353 s1 = p03 * k0 + p13 * k3 + p23 * k6;
354
355 dp[0 ] = FROM_S32(d0);
356 dp[chan1] = FROM_S32(d1);
357
358#else /* __sparc */
359
360 dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
361 dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
362 *(FTYPE *)(buffo + i) = dd.d64;
363
364 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
365 s1 = p03 * k0 + p13 * k3 + p23 * k6;
366
367#ifdef _NO_LONGLONG
368
369 o64_1 = buffo[i];
370 o64_2 = buffo[i+1];
371#if IMG_TYPE != 1
372 STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
373#else
374 STORE2(o64_1 >> 24, o64_2 >> 24);
375#endif /* IMG_TYPE != 1 */
376
377#else /* _NO_LONGLONG */
378
379 o64 = *(mlib_s64*)(buffo + i);
380#if IMG_TYPE != 1
381 STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
382#else
383 STORE2(o64 >> 56, o64 >> 24);
384#endif /* IMG_TYPE != 1 */
385#endif /* _NO_LONGLONG */
386#endif /* __sparc */
387
388 sp += chan2;
389 dp += chan2;
390 }
391
392 for (; i < wid; i++) {
393 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i];
394 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
395 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
396
397 buffi[i] = (mlib_s32)sp[0];
398 buff3[i] = (FTYPE)buffi[i];
399
400#ifndef __sparc
401
402 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
403 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
404
405 dp[0] = FROM_S32(d0);
406
407#else /* __sparc */
408
409 buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
410 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
411#if IMG_TYPE != 1
412 dp[0] = FROM_S32(buffo[i]);
413#else
414 dp[0] = buffo[i] >> 24;
415#endif /* IMG_TYPE != 1 */
416#endif /* __sparc */
417
418 sp += chan1;
419 dp += chan1;
420 }
421
422 buffi[wid] = (mlib_s32)sp[0];
423 buff3[wid] = (FTYPE)buffi[wid];
424 buffi[wid + 1] = (mlib_s32)sp[chan1];
425 buff3[wid + 1] = (FTYPE)buffi[wid + 1];
426
427 sl += sll;
428 dl += dll;
429
430 buffT = buff0;
431 buff0 = buff1;
432 buff1 = buff2;
433 buff2 = buff3;
434 buff3 = buffT;
435 }
436 }
437
438#ifdef __sparc
439#if IMG_TYPE == 1
440 {
441 mlib_s32 amask = (1 << nchannel) - 1;
442
443 if ((cmask & amask) != amask) {
444 mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
445 } else {
446 mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
447 }
448 }
449
450#endif /* IMG_TYPE == 1 */
451#endif /* __sparc */
452
453 if (pbuff != buff) mlib_free(pbuff);
454
455 return MLIB_SUCCESS;
456}
457
458/***************************************************************/
459#ifndef __sparc /* for x86, using integer multiplies is faster */
460
461mlib_status CONV_FUNC_I(3x3)(mlib_image *dst,
462 const mlib_image *src,
463 const mlib_s32 *kern,
464 mlib_s32 scalef_expon,
465 mlib_s32 cmask)
466{
467 DTYPE *adr_src, *sl, *sp0, *sp1, *sp2;
468 DTYPE *adr_dst, *dl, *dp;
469 mlib_s32 wid, hgt, sll, dll;
470 mlib_s32 nchannel, chan1, chan2;
471 mlib_s32 i, j, c;
472 mlib_s32 shift1, shift2;
473 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
474 mlib_s32 p02, p03,
475 p12, p13,
476 p22, p23;
477
478#if IMG_TYPE != 1
479 shift1 = 16;
480#else
481 shift1 = 8;
482#endif /* IMG_TYPE != 1 */
483
484 shift2 = scalef_expon - shift1;
485
486 /* keep kernel in regs */
487 k0 = kern[0] >> shift1; k1 = kern[1] >> shift1; k2 = kern[2] >> shift1;
488 k3 = kern[3] >> shift1; k4 = kern[4] >> shift1; k5 = kern[5] >> shift1;
489 k6 = kern[6] >> shift1; k7 = kern[7] >> shift1; k8 = kern[8] >> shift1;
490
491 GET_SRC_DST_PARAMETERS(DTYPE);
492
493 chan1 = nchannel;
494 chan2 = chan1 + chan1;
495
496 wid -= (KSIZE - 1);
497 hgt -= (KSIZE - 1);
498
499 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
500
501 for (c = 0; c < chan1; c++) {
502 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
503
504 sl = adr_src + c;
505 dl = adr_dst + c;
506
507 for (j = 0; j < hgt; j++) {
508 mlib_s32 s0, s1;
509 mlib_s32 pix0, pix1;
510
511 dp = dl;
512 sp0 = sl;
513 sp1 = sp0 + sll;
514 sp2 = sp1 + sll;
515
516 p02 = sp0[0];
517 p12 = sp1[0];
518 p22 = sp2[0];
519
520 p03 = sp0[chan1];
521 p13 = sp1[chan1];
522 p23 = sp2[chan1];
523
524 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
525 s1 = p03 * k0 + p13 * k3 + p23 * k6;
526
527 sp0 += chan2;
528 sp1 += chan2;
529 sp2 += chan2;
530
531#ifdef __SUNPRO_C
532#pragma pipeloop(0)
533#endif /* __SUNPRO_C */
534 for (i = 0; i <= (wid - 2); i += 2) {
535 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
536 p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
537
538 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
539 pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
540 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
541
542 CLAMP_STORE(dp[0], pix0);
543 CLAMP_STORE(dp[chan1], pix1);
544
545 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
546 s1 = p03 * k0 + p13 * k3 + p23 * k6;
547
548 sp0 += chan2;
549 sp1 += chan2;
550 sp2 += chan2;
551 dp += chan2;
552 }
553
554 if (wid & 1) {
555 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
556 pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
557 CLAMP_STORE(dp[0], pix0);
558 }
559
560 sl += sll;
561 dl += dll;
562 }
563 }
564
565 return MLIB_SUCCESS;
566}
567
568#endif /* __sparc ( for x86, using integer multiplies is faster ) */
569
570/***************************************************************/
571#undef KSIZE
572#define KSIZE 4
573
574mlib_status CONV_FUNC(4x4)(mlib_image *dst,
575 const mlib_image *src,
576 const mlib_s32 *kern,
577 mlib_s32 scalef_expon,
578 mlib_s32 cmask)
579{
580 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
581 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
582 FTYPE k[KSIZE*KSIZE];
583 mlib_s32 d0, d1;
584 FTYPE k0, k1, k2, k3, k4, k5, k6, k7;
585 FTYPE p00, p01, p02, p03, p04,
586 p10, p11, p12, p13, p14,
587 p20, p21, p22, p23,
588 p30, p31, p32, p33;
589 DEF_VARS(DTYPE);
590 DTYPE *sl1;
591 mlib_s32 chan2;
592 mlib_s32 *buffo, *buffi;
593 DTYPE *sl2, *sl3;
594 LOAD_KERNEL(KSIZE*KSIZE);
595 GET_SRC_DST_PARAMETERS(DTYPE);
596
597 if (wid > BUFF_LINE) {
598 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
599
600 if (pbuff == NULL) return MLIB_FAILURE;
601 }
602
603 buff0 = pbuff;
604 buff1 = buff0 + wid;
605 buff2 = buff1 + wid;
606 buff3 = buff2 + wid;
607 buff4 = buff3 + wid;
608 buffd = buff4 + wid;
609 buffo = (mlib_s32*)(buffd + wid);
610 buffi = buffo + (wid &~ 1);
611
612 chan1 = nchannel;
613 chan2 = chan1 + chan1;
614
615 wid -= (KSIZE - 1);
616 hgt -= (KSIZE - 1);
617
618 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
619
620 for (c = 0; c < nchannel; c++) {
621 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
622
623 sl = adr_src + c;
624 dl = adr_dst + c;
625
626 sl1 = sl + sll;
627 sl2 = sl1 + sll;
628 sl3 = sl2 + sll;
629#ifdef __SUNPRO_C
630#pragma pipeloop(0)
631#endif /* __SUNPRO_C */
632 for (i = 0; i < wid + (KSIZE - 1); i++) {
633 buff0[i] = (FTYPE)sl[i*chan1];
634 buff1[i] = (FTYPE)sl1[i*chan1];
635 buff2[i] = (FTYPE)sl2[i*chan1];
636 buff3[i] = (FTYPE)sl3[i*chan1];
637 }
638
639 sl += KSIZE*sll;
640
641 for (j = 0; j < hgt; j++) {
642 d64_2x32 dd;
643
644 /*
645 * First loop on two first lines of kernel
646 */
647 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
648 k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
649
650 sp = sl;
651 dp = dl;
652
653 p02 = buff0[0];
654 p12 = buff1[0];
655 p03 = buff0[1];
656 p13 = buff1[1];
657 p04 = buff0[2];
658
659#ifdef __SUNPRO_C
660#pragma pipeloop(0)
661#endif /* __SUNPRO_C */
662 for (i = 0; i <= (wid - 2); i += 2) {
663 p00 = p02; p10 = p12;
664 p01 = p03; p11 = p13;
665 p02 = p04; p12 = buff1[i + 2];
666 p03 = buff0[i + 3]; p13 = buff1[i + 3];
667 p04 = buff0[i + 4]; p14 = buff1[i + 4];
668
669 LOAD_BUFF(buffi);
670
671 dd.d64 = *(FTYPE *)(buffi + i);
672 buff4[i ] = (FTYPE)dd.i32s.i0;
673 buff4[i + 1] = (FTYPE)dd.i32s.i1;
674
675 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
676 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
677 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
678 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
679
680 sp += chan2;
681 dp += chan2;
682 }
683
684 /*
685 * Second loop on two last lines of kernel
686 */
687 k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
688 k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
689
690 sp = sl;
691 dp = dl;
692
693 p02 = buff2[0];
694 p12 = buff3[0];
695 p03 = buff2[1];
696 p13 = buff3[1];
697 p04 = buff2[2];
698
699#ifdef __SUNPRO_C
700#pragma pipeloop(0)
701#endif /* __SUNPRO_C */
702 for (i = 0; i <= (wid - 2); i += 2) {
703 p00 = p02; p10 = p12;
704 p01 = p03; p11 = p13;
705 p02 = p04; p12 = buff3[i + 2];
706 p03 = buff2[i + 3]; p13 = buff3[i + 3];
707 p04 = buff2[i + 4]; p14 = buff3[i + 4];
708
709 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
710 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
711 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
712 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
713
714 dp[0 ] = FROM_S32(d0);
715 dp[chan1] = FROM_S32(d1);
716
717 sp += chan2;
718 dp += chan2;
719 }
720
721 /* last pixels */
722 for (; i < wid; i++) {
723 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
724 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
725 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
726 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
727
728 buff4[i] = (FTYPE)sp[0];
729
730 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
731 p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
732 p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
733 p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
734
735 dp[0] = FROM_S32(buffo[i]);
736
737 sp += chan1;
738 dp += chan1;
739 }
740
741 buff4[wid ] = (FTYPE)sp[0];
742 buff4[wid + 1] = (FTYPE)sp[chan1];
743 buff4[wid + 2] = (FTYPE)sp[chan2];
744
745 /* next line */
746 sl += sll;
747 dl += dll;
748
749 buffT = buff0;
750 buff0 = buff1;
751 buff1 = buff2;
752 buff2 = buff3;
753 buff3 = buff4;
754 buff4 = buffT;
755 }
756 }
757
758 if (pbuff != buff) mlib_free(pbuff);
759
760 return MLIB_SUCCESS;
761}
762
763/***************************************************************/
764#undef KSIZE
765#define KSIZE 5
766
767mlib_status CONV_FUNC(5x5)(mlib_image *dst,
768 const mlib_image *src,
769 const mlib_s32 *kern,
770 mlib_s32 scalef_expon,
771 mlib_s32 cmask)
772{
773 FTYPE buff[(KSIZE + 3)*BUFF_LINE];
774 FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
775 FTYPE k[KSIZE*KSIZE];
776 mlib_s32 d0, d1;
777 FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
778 FTYPE p00, p01, p02, p03, p04, p05,
779 p10, p11, p12, p13, p14, p15,
780 p20, p21, p22, p23, p24,
781 p30, p31, p32, p33, p34,
782 p40, p41, p42, p43, p44;
783 DEF_VARS(DTYPE);
784 DTYPE *sl1;
785 mlib_s32 chan2;
786 mlib_s32 *buffo, *buffi;
787 DTYPE *sl2, *sl3, *sl4;
788 LOAD_KERNEL(KSIZE*KSIZE);
789 GET_SRC_DST_PARAMETERS(DTYPE);
790
791 if (wid > BUFF_LINE) {
792 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
793
794 if (pbuff == NULL) return MLIB_FAILURE;
795 }
796
797 buff0 = pbuff;
798 buff1 = buff0 + wid;
799 buff2 = buff1 + wid;
800 buff3 = buff2 + wid;
801 buff4 = buff3 + wid;
802 buff5 = buff4 + wid;
803 buffd = buff5 + wid;
804 buffo = (mlib_s32*)(buffd + wid);
805 buffi = buffo + (wid &~ 1);
806
807 chan1 = nchannel;
808 chan2 = chan1 + chan1;
809
810 wid -= (KSIZE - 1);
811 hgt -= (KSIZE - 1);
812
813 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
814
815 for (c = 0; c < nchannel; c++) {
816 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
817
818 sl = adr_src + c;
819 dl = adr_dst + c;
820
821 sl1 = sl + sll;
822 sl2 = sl1 + sll;
823 sl3 = sl2 + sll;
824 sl4 = sl3 + sll;
825#ifdef __SUNPRO_C
826#pragma pipeloop(0)
827#endif /* __SUNPRO_C */
828 for (i = 0; i < wid + (KSIZE - 1); i++) {
829 buff0[i] = (FTYPE)sl[i*chan1];
830 buff1[i] = (FTYPE)sl1[i*chan1];
831 buff2[i] = (FTYPE)sl2[i*chan1];
832 buff3[i] = (FTYPE)sl3[i*chan1];
833 buff4[i] = (FTYPE)sl4[i*chan1];
834 }
835
836 sl += KSIZE*sll;
837
838 for (j = 0; j < hgt; j++) {
839 d64_2x32 dd;
840
841 /*
842 * First loop
843 */
844 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
845 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
846
847 sp = sl;
848 dp = dl;
849
850 p02 = buff0[0];
851 p12 = buff1[0];
852 p03 = buff0[1];
853 p13 = buff1[1];
854 p04 = buff0[2];
855 p14 = buff1[2];
856
857#ifdef __SUNPRO_C
858#pragma pipeloop(0)
859#endif /* __SUNPRO_C */
860 for (i = 0; i <= (wid - 2); i += 2) {
861 p00 = p02; p10 = p12;
862 p01 = p03; p11 = p13;
863 p02 = p04; p12 = p14;
864
865 LOAD_BUFF(buffi);
866
867 p03 = buff0[i + 3]; p13 = buff1[i + 3];
868 p04 = buff0[i + 4]; p14 = buff1[i + 4];
869 p05 = buff0[i + 5]; p15 = buff1[i + 5];
870
871 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
872 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
873 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
874 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
875
876 sp += chan2;
877 dp += chan2;
878 }
879
880 /*
881 * Second loop
882 */
883 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
884 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
885
886 sp = sl;
887 dp = dl;
888
889 p02 = buff2[0];
890 p12 = buff3[0];
891 p03 = buff2[1];
892 p13 = buff3[1];
893 p04 = buff2[2];
894 p14 = buff3[2];
895
896#ifdef __SUNPRO_C
897#pragma pipeloop(0)
898#endif /* __SUNPRO_C */
899 for (i = 0; i <= (wid - 2); i += 2) {
900 p00 = p02; p10 = p12;
901 p01 = p03; p11 = p13;
902
903 p02 = buff2[i + 2]; p12 = buff3[i + 2];
904 p03 = buff2[i + 3]; p13 = buff3[i + 3];
905 p04 = buff2[i + 4]; p14 = buff3[i + 4];
906 p05 = buff2[i + 5]; p15 = buff3[i + 5];
907
908 dd.d64 = *(FTYPE *)(buffi + i);
909 buff5[i ] = (FTYPE)dd.i32s.i0;
910 buff5[i + 1] = (FTYPE)dd.i32s.i1;
911
912 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
913 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
914 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
915 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
916
917 sp += chan2;
918 dp += chan2;
919 }
920
921 /*
922 * 3 loop
923 */
924 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
925
926 sp = sl;
927 dp = dl;
928
929 p02 = buff4[0];
930 p03 = buff4[1];
931 p04 = buff4[2];
932 p05 = buff4[3];
933
934#ifdef __SUNPRO_C
935#pragma pipeloop(0)
936#endif /* __SUNPRO_C */
937 for (i = 0; i <= (wid - 2); i += 2) {
938 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
939
940 p04 = buff4[i + 4]; p05 = buff4[i + 5];
941
942 d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
943 d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
944
945 dp[0 ] = FROM_S32(d0);
946 dp[chan1] = FROM_S32(d1);
947
948 sp += chan2;
949 dp += chan2;
950 }
951
952 /* last pixels */
953 for (; i < wid; i++) {
954 p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
955 p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
956 p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
957 p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
958 p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
959
960 p40 = buff4[i]; p41 = buff4[i + 1]; p42 = buff4[i + 2];
961 p43 = buff4[i + 3]; p44 = buff4[i + 4];
962
963 buff5[i] = (FTYPE)sp[0];
964
965 buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
966 p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
967 p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
968 p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
969 p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
970
971 dp[0] = FROM_S32(buffo[i]);
972
973 sp += chan1;
974 dp += chan1;
975 }
976
977 buff5[wid ] = (FTYPE)sp[0];
978 buff5[wid + 1] = (FTYPE)sp[chan1];
979 buff5[wid + 2] = (FTYPE)sp[chan2];
980 buff5[wid + 3] = (FTYPE)sp[chan2 + chan1];
981
982 /* next line */
983 sl += sll;
984 dl += dll;
985
986 buffT = buff0;
987 buff0 = buff1;
988 buff1 = buff2;
989 buff2 = buff3;
990 buff3 = buff4;
991 buff4 = buff5;
992 buff5 = buffT;
993 }
994 }
995
996 if (pbuff != buff) mlib_free(pbuff);
997
998 return MLIB_SUCCESS;
999}
1000
1001/***************************************************************/
1002#ifndef __sparc /* for x86, using integer multiplies is faster */
1003
1004mlib_status CONV_FUNC_I(5x5)(mlib_image *dst,
1005 const mlib_image *src,
1006 const mlib_s32 *kern,
1007 mlib_s32 scalef_expon,
1008 mlib_s32 cmask)
1009{
1010 mlib_s32 buff[BUFF_LINE];
1011 mlib_s32 *buffd;
1012 mlib_s32 k[KSIZE*KSIZE];
1013 mlib_s32 shift1, shift2;
1014 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
1015 mlib_s32 p00, p01, p02, p03, p04, p05,
1016 p10, p11, p12, p13, p14, p15;
1017 DTYPE *adr_src, *sl, *sp0, *sp1;
1018 DTYPE *adr_dst, *dl, *dp;
1019 mlib_s32 *pbuff = buff;
1020 mlib_s32 wid, hgt, sll, dll;
1021 mlib_s32 nchannel, chan1, chan2, chan3, chan4;
1022 mlib_s32 i, j, c;
1023
1024#if IMG_TYPE != 1
1025 shift1 = 16;
1026#else
1027 shift1 = 8;
1028#endif /* IMG_TYPE != 1 */
1029
1030 shift2 = scalef_expon - shift1;
1031
1032 for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
1033
1034 GET_SRC_DST_PARAMETERS(DTYPE);
1035
1036 if (wid > BUFF_LINE) {
1037 pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
1038
1039 if (pbuff == NULL) return MLIB_FAILURE;
1040 }
1041
1042 buffd = pbuff;
1043
1044 chan1 = nchannel;
1045 chan2 = chan1 + chan1;
1046 chan3 = chan2 + chan1;
1047 chan4 = chan3 + chan1;
1048
1049 wid -= (KSIZE - 1);
1050 hgt -= (KSIZE - 1);
1051
1052 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
1053
1054 for (c = 0; c < chan1; c++) {
1055 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1056
1057 sl = adr_src + c;
1058 dl = adr_dst + c;
1059
1060 for (j = 0; j < hgt; j++) {
1061 mlib_s32 pix0, pix1;
1062 /*
1063 * First loop
1064 */
1065 sp0 = sl;
1066 sp1 = sp0 + sll;
1067 dp = dl;
1068
1069 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
1070 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
1071
1072 p02 = sp0[0]; p12 = sp1[0];
1073 p03 = sp0[chan1]; p13 = sp1[chan1];
1074 p04 = sp0[chan2]; p14 = sp1[chan2];
1075 p05 = sp0[chan3]; p15 = sp1[chan3];
1076
1077 sp0 += chan4;
1078 sp1 += chan4;
1079
1080#ifdef __SUNPRO_C
1081#pragma pipeloop(0)
1082#endif /* __SUNPRO_C */
1083 for (i = 0; i <= (wid - 2); i += 2) {
1084 p00 = p02; p10 = p12;
1085 p01 = p03; p11 = p13;
1086 p02 = p04; p12 = p14;
1087 p03 = p05; p13 = p15;
1088
1089 p04 = sp0[0]; p14 = sp1[0];
1090 p05 = sp0[chan1]; p15 = sp1[chan1];
1091
1092 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1093 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1094 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1095 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1096
1097 sp0 += chan2;
1098 sp1 += chan2;
1099 dp += chan2;
1100 }
1101
1102 if (wid & 1) {
1103 p00 = p02; p10 = p12;
1104 p01 = p03; p11 = p13;
1105 p02 = p04; p12 = p14;
1106 p03 = p05; p13 = p15;
1107
1108 p04 = sp0[0]; p14 = sp1[0];
1109
1110 buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1111 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1112 }
1113
1114 /*
1115 * Second loop
1116 */
1117 sp0 = sl + 2*sll;
1118 sp1 = sp0 + sll;
1119 dp = dl;
1120
1121 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
1122 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
1123
1124 p02 = sp0[0]; p12 = sp1[0];
1125 p03 = sp0[chan1]; p13 = sp1[chan1];
1126 p04 = sp0[chan2]; p14 = sp1[chan2];
1127 p05 = sp0[chan3]; p15 = sp1[chan3];
1128
1129 sp0 += chan4;
1130 sp1 += chan4;
1131
1132#ifdef __SUNPRO_C
1133#pragma pipeloop(0)
1134#endif /* __SUNPRO_C */
1135 for (i = 0; i <= (wid - 2); i += 2) {
1136 p00 = p02; p10 = p12;
1137 p01 = p03; p11 = p13;
1138 p02 = p04; p12 = p14;
1139 p03 = p05; p13 = p15;
1140
1141 p04 = sp0[0]; p14 = sp1[0];
1142 p05 = sp0[chan1]; p15 = sp1[chan1];
1143
1144 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1145 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1146 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1147 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1148
1149 sp0 += chan2;
1150 sp1 += chan2;
1151 dp += chan2;
1152 }
1153
1154 if (wid & 1) {
1155 p00 = p02; p10 = p12;
1156 p01 = p03; p11 = p13;
1157 p02 = p04; p12 = p14;
1158 p03 = p05; p13 = p15;
1159
1160 p04 = sp0[0]; p14 = sp1[0];
1161
1162 buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1163 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1164 }
1165
1166 /*
1167 * 3 loop
1168 */
1169 dp = dl;
1170 sp0 = sl + 4*sll;
1171
1172 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1173
1174 p02 = sp0[0];
1175 p03 = sp0[chan1];
1176 p04 = sp0[chan2];
1177 p05 = sp0[chan3];
1178
1179 sp0 += chan2 + chan2;
1180
1181#ifdef __SUNPRO_C
1182#pragma pipeloop(0)
1183#endif /* __SUNPRO_C */
1184 for (i = 0; i <= (wid - 2); i += 2) {
1185 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1186
1187 p04 = sp0[0]; p05 = sp0[chan1];
1188
1189 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1190 p03 * k3 + p04 * k4) >> shift2;
1191 pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
1192 p04 * k3 + p05 * k4) >> shift2;
1193
1194 CLAMP_STORE(dp[0], pix0);
1195 CLAMP_STORE(dp[chan1], pix1);
1196
1197 dp += chan2;
1198 sp0 += chan2;
1199 }
1200
1201 if (wid & 1) {
1202 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1203
1204 p04 = sp0[0];
1205
1206 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1207 p03 * k3 + p04 * k4) >> shift2;
1208 CLAMP_STORE(dp[0], pix0);
1209 }
1210
1211 /* next line */
1212 sl += sll;
1213 dl += dll;
1214 }
1215 }
1216
1217 if (pbuff != buff) mlib_free(pbuff);
1218
1219 return MLIB_SUCCESS;
1220}
1221
1222#endif /* __sparc ( for x86, using integer multiplies is faster ) */
1223
1224/***************************************************************/
1225#if IMG_TYPE == 1
1226
1227#undef KSIZE
1228#define KSIZE 7
1229
1230mlib_status CONV_FUNC(7x7)(mlib_image *dst,
1231 const mlib_image *src,
1232 const mlib_s32 *kern,
1233 mlib_s32 scalef_expon,
1234 mlib_s32 cmask)
1235{
1236 FTYPE buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
1237 FTYPE k[KSIZE*KSIZE];
1238 mlib_s32 l, m, buff_ind;
1239 mlib_s32 d0, d1;
1240 FTYPE k0, k1, k2, k3, k4, k5, k6;
1241 FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
1242 DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
1243 DEF_VARS(DTYPE);
1244 DTYPE *sl1;
1245 mlib_s32 chan2;
1246 mlib_s32 *buffo, *buffi;
1247 LOAD_KERNEL(KSIZE*KSIZE);
1248 GET_SRC_DST_PARAMETERS(DTYPE);
1249
1250 if (wid > BUFF_LINE) {
1251 pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*wid);
1252
1253 if (pbuff == NULL) return MLIB_FAILURE;
1254 }
1255
1256 for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*wid;
1257 for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
1258 buffd = buffs[KSIZE] + wid;
1259 buffo = (mlib_s32*)(buffd + wid);
1260 buffi = buffo + (wid &~ 1);
1261
1262 chan1 = nchannel;
1263 chan2 = chan1 + chan1;
1264
1265 wid -= (KSIZE - 1);
1266 hgt -= (KSIZE - 1);
1267
1268 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
1269
1270 for (c = 0; c < nchannel; c++) {
1271 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1272
1273 sl = adr_src + c;
1274 dl = adr_dst + c;
1275
1276 sl1 = sl + sll;
1277 sl2 = sl1 + sll;
1278 sl3 = sl2 + sll;
1279 sl4 = sl3 + sll;
1280 sl5 = sl4 + sll;
1281 sl6 = sl5 + sll;
1282#ifdef __SUNPRO_C
1283#pragma pipeloop(0)
1284#endif /* __SUNPRO_C */
1285 for (i = 0; i < wid + (KSIZE - 1); i++) {
1286 buffs[0][i] = (FTYPE)sl[i*chan1];
1287 buffs[1][i] = (FTYPE)sl1[i*chan1];
1288 buffs[2][i] = (FTYPE)sl2[i*chan1];
1289 buffs[3][i] = (FTYPE)sl3[i*chan1];
1290 buffs[4][i] = (FTYPE)sl4[i*chan1];
1291 buffs[5][i] = (FTYPE)sl5[i*chan1];
1292 buffs[6][i] = (FTYPE)sl6[i*chan1];
1293 }
1294
1295 buff_ind = 0;
1296
1297#ifdef __SUNPRO_C
1298#pragma pipeloop(0)
1299#endif /* __SUNPRO_C */
1300 for (i = 0; i < wid; i++) buffd[i] = 0.0;
1301
1302 sl += KSIZE*sll;
1303
1304 for (j = 0; j < hgt; j++) {
1305 FTYPE **buffc = buffs + buff_ind;
1306 FTYPE *buffn = buffc[KSIZE];
1307 FTYPE *pk = k;
1308
1309 for (l = 0; l < KSIZE; l++) {
1310 FTYPE *buff = buffc[l];
1311 d64_2x32 dd;
1312
1313 sp = sl;
1314 dp = dl;
1315
1316 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1317 p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1318
1319 k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
1320 k4 = *pk++; k5 = *pk++; k6 = *pk++;
1321
1322 if (l < (KSIZE - 1)) {
1323#ifdef __SUNPRO_C
1324#pragma pipeloop(0)
1325#endif /* __SUNPRO_C */
1326 for (i = 0; i <= (wid - 2); i += 2) {
1327 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1328
1329 p6 = buff[i + 6]; p7 = buff[i + 7];
1330
1331 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1332 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1333 }
1334
1335 } else {
1336#ifdef __SUNPRO_C
1337#pragma pipeloop(0)
1338#endif /* __SUNPRO_C */
1339 for (i = 0; i <= (wid - 2); i += 2) {
1340 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1341
1342 p6 = buff[i + 6]; p7 = buff[i + 7];
1343
1344 LOAD_BUFF(buffi);
1345
1346 dd.d64 = *(FTYPE *)(buffi + i);
1347 buffn[i ] = (FTYPE)dd.i32s.i0;
1348 buffn[i + 1] = (FTYPE)dd.i32s.i1;
1349
1350 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
1351 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1352
1353 dp[0 ] = FROM_S32(d0);
1354 dp[chan1] = FROM_S32(d1);
1355
1356 buffd[i ] = 0.0;
1357 buffd[i + 1] = 0.0;
1358
1359 sp += chan2;
1360 dp += chan2;
1361 }
1362 }
1363 }
1364
1365 /* last pixels */
1366 for (; i < wid; i++) {
1367 FTYPE *pk = k, s = 0;
1368 mlib_s32 d0;
1369
1370 for (l = 0; l < KSIZE; l++) {
1371 FTYPE *buff = buffc[l] + i;
1372
1373 for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
1374 }
1375
1376 d0 = D2I(s);
1377 dp[0] = FROM_S32(d0);
1378
1379 buffn[i] = (FTYPE)sp[0];
1380
1381 sp += chan1;
1382 dp += chan1;
1383 }
1384
1385 for (l = 0; l < (KSIZE - 1); l++) buffn[wid + l] = sp[l*chan1];
1386
1387 /* next line */
1388 sl += sll;
1389 dl += dll;
1390
1391 buff_ind++;
1392
1393 if (buff_ind >= KSIZE + 1) buff_ind = 0;
1394 }
1395 }
1396
1397 if (pbuff != buff) mlib_free(pbuff);
1398
1399 return MLIB_SUCCESS;
1400}
1401
1402#endif /* IMG_TYPE == 1 */
1403
1404/***************************************************************/
1405#define MAX_KER 7
1406#define MAX_N 15
1407
1408static mlib_status mlib_ImageConv1xN(mlib_image *dst,
1409 const mlib_image *src,
1410 const mlib_d64 *k,
1411 mlib_s32 n,
1412 mlib_s32 dn,
1413 mlib_s32 cmask)
1414{
1415 FTYPE buff[BUFF_SIZE];
1416 mlib_s32 off, kh;
1417 mlib_s32 d0, d1;
1418 const FTYPE *pk;
1419 FTYPE k0, k1, k2, k3;
1420 FTYPE p0, p1, p2, p3, p4;
1421 DEF_VARS(DTYPE);
1422 DTYPE *sl_c, *dl_c, *sl0;
1423 mlib_s32 l, hsize, max_hsize;
1424 GET_SRC_DST_PARAMETERS(DTYPE);
1425
1426 hgt -= (n - 1);
1427 adr_dst += dn*dll;
1428
1429 max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
1430
1431 if (!max_hsize) max_hsize = 1;
1432
1433 if (max_hsize > BUFF_SIZE) {
1434 pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
1435 }
1436
1437 chan1 = nchannel;
1438
1439 sl_c = adr_src;
1440 dl_c = adr_dst;
1441
1442 for (l = 0; l < hgt; l += hsize) {
1443 hsize = hgt - l;
1444
1445 if (hsize > max_hsize) hsize = max_hsize;
1446
1447 for (c = 0; c < nchannel; c++) {
1448 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1449
1450 sl = sl_c + c;
1451 dl = dl_c + c;
1452
1453#ifdef __SUNPRO_C
1454#pragma pipeloop(0)
1455#endif /* __SUNPRO_C */
1456 for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
1457
1458 for (i = 0; i < wid; i++) {
1459 sl0 = sl;
1460
1461 for (off = 0; off < (n - 4); off += 4) {
1462 pk = k + off;
1463 sp = sl0;
1464
1465 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1466 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1467 sp += 3*sll;
1468
1469#ifdef __SUNPRO_C
1470#pragma pipeloop(0)
1471#endif /* __SUNPRO_C */
1472 for (j = 0; j < hsize; j += 2) {
1473 p0 = p2; p1 = p3; p2 = p4;
1474 p3 = sp[0];
1475 p4 = sp[sll];
1476
1477 pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1478 pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1479
1480 sp += 2*sll;
1481 }
1482
1483 sl0 += 4*sll;
1484 }
1485
1486 pk = k + off;
1487 sp = sl0;
1488
1489 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1490 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
1491
1492 dp = dl;
1493 kh = n - off;
1494
1495 if (kh == 4) {
1496 sp += 3*sll;
1497
1498#ifdef __SUNPRO_C
1499#pragma pipeloop(0)
1500#endif /* __SUNPRO_C */
1501 for (j = 0; j <= (hsize - 2); j += 2) {
1502 p0 = p2; p1 = p3; p2 = p4;
1503 p3 = sp[0];
1504 p4 = sp[sll];
1505
1506 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1507 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);
1508
1509 dp[0 ] = FROM_S32(d0);
1510 dp[dll] = FROM_S32(d1);
1511
1512 pbuff[j] = 0;
1513 pbuff[j + 1] = 0;
1514
1515 sp += 2*sll;
1516 dp += 2*dll;
1517 }
1518
1519 if (j < hsize) {
1520 p0 = p2; p1 = p3; p2 = p4;
1521 p3 = sp[0];
1522
1523 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
1524
1525 pbuff[j] = 0;
1526
1527 dp[0] = FROM_S32(d0);
1528 }
1529
1530 } else if (kh == 3) {
1531 sp += 2*sll;
1532
1533#ifdef __SUNPRO_C
1534#pragma pipeloop(0)
1535#endif /* __SUNPRO_C */
1536 for (j = 0; j <= (hsize - 2); j += 2) {
1537 p0 = p2; p1 = p3;
1538 p2 = sp[0];
1539 p3 = sp[sll];
1540
1541 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1542 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);
1543
1544 dp[0 ] = FROM_S32(d0);
1545 dp[dll] = FROM_S32(d1);
1546
1547 pbuff[j] = 0;
1548 pbuff[j + 1] = 0;
1549
1550 sp += 2*sll;
1551 dp += 2*dll;
1552 }
1553
1554 if (j < hsize) {
1555 p0 = p2; p1 = p3;
1556 p2 = sp[0];
1557
1558 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
1559
1560 pbuff[j] = 0;
1561
1562 dp[0] = FROM_S32(d0);
1563 }
1564
1565 } else if (kh == 2) {
1566 sp += sll;
1567
1568#ifdef __SUNPRO_C
1569#pragma pipeloop(0)
1570#endif /* __SUNPRO_C */
1571 for (j = 0; j <= (hsize - 2); j += 2) {
1572 p0 = p2;
1573 p1 = sp[0];
1574 p2 = sp[sll];
1575
1576 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1577 d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);
1578
1579 dp[0 ] = FROM_S32(d0);
1580 dp[dll] = FROM_S32(d1);
1581
1582 pbuff[j] = 0;
1583 pbuff[j + 1] = 0;
1584
1585 sp += 2*sll;
1586 dp += 2*dll;
1587 }
1588
1589 if (j < hsize) {
1590 p0 = p2;
1591 p1 = sp[0];
1592
1593 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
1594
1595 pbuff[j] = 0;
1596
1597 dp[0] = FROM_S32(d0);
1598 }
1599
1600 } else /* if (kh == 1) */ {
1601#ifdef __SUNPRO_C
1602#pragma pipeloop(0)
1603#endif /* __SUNPRO_C */
1604 for (j = 0; j < hsize; j++) {
1605 p0 = sp[0];
1606
1607 d0 = D2I(p0*k0 + pbuff[j]);
1608
1609 dp[0] = FROM_S32(d0);
1610
1611 pbuff[j] = 0;
1612
1613 sp += sll;
1614 dp += dll;
1615 }
1616 }
1617
1618 sl += chan1;
1619 dl += chan1;
1620 }
1621 }
1622
1623 sl_c += max_hsize*sll;
1624 dl_c += max_hsize*dll;
1625 }
1626
1627 if (pbuff != buff) mlib_free(pbuff);
1628
1629 return MLIB_SUCCESS;
1630}
1631
1632/***************************************************************/
1633mlib_status CONV_FUNC(MxN)(mlib_image *dst,
1634 const mlib_image *src,
1635 const mlib_s32 *kernel,
1636 mlib_s32 m,
1637 mlib_s32 n,
1638 mlib_s32 dm,
1639 mlib_s32 dn,
1640 mlib_s32 scale,
1641 mlib_s32 cmask)
1642{
1643 FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1644 FTYPE **buffs = buffs_arr, *buffd;
1645 FTYPE akernel[256], *k = akernel, fscale = DSCALE;
1646 mlib_s32 mn, l, off, kw, bsize, buff_ind;
1647 mlib_s32 d0, d1;
1648 FTYPE k0, k1, k2, k3, k4, k5, k6;
1649 FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
1650 d64_2x32 dd;
1651 DEF_VARS(DTYPE);
1652 mlib_s32 chan2;
1653 mlib_s32 *buffo, *buffi;
1654 GET_SRC_DST_PARAMETERS(DTYPE);
1655
1656 if (scale > 30) {
1657 fscale *= 1.0/(1 << 30);
1658 scale -= 30;
1659 }
1660
1661 fscale /= (1 << scale);
1662
1663 mn = m*n;
1664
1665 if (mn > 256) {
1666 k = mlib_malloc(mn*sizeof(mlib_d64));
1667
1668 if (k == NULL) return MLIB_FAILURE;
1669 }
1670
1671 for (i = 0; i < mn; i++) {
1672 k[i] = kernel[i]*fscale;
1673 }
1674
1675 if (m == 1) return mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
1676
1677 bsize = (n + 3)*wid;
1678
1679 if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1680 pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
1681
1682 if (pbuff == NULL) return MLIB_FAILURE;
1683 buffs = (FTYPE **)(pbuff + bsize);
1684 }
1685
1686 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
1687 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1688 buffd = buffs[n] + wid;
1689 buffo = (mlib_s32*)(buffd + wid);
1690 buffi = buffo + (wid &~ 1);
1691
1692 chan1 = nchannel;
1693 chan2 = chan1 + chan1;
1694
1695 wid -= (m - 1);
1696 hgt -= (n - 1);
1697 adr_dst += dn*dll + dm*nchannel;
1698
1699 for (c = 0; c < nchannel; c++) {
1700 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1701
1702 sl = adr_src + c;
1703 dl = adr_dst + c;
1704
1705 for (l = 0; l < n; l++) {
1706 FTYPE *buff = buffs[l];
1707
1708#ifdef __SUNPRO_C
1709#pragma pipeloop(0)
1710#endif /* __SUNPRO_C */
1711 for (i = 0; i < wid + (m - 1); i++) {
1712 buff[i] = (FTYPE)sl[i*chan1];
1713 }
1714
1715 sl += sll;
1716 }
1717
1718 buff_ind = 0;
1719
1720#ifdef __SUNPRO_C
1721#pragma pipeloop(0)
1722#endif /* __SUNPRO_C */
1723 for (i = 0; i < wid; i++) buffd[i] = 0.0;
1724
1725 for (j = 0; j < hgt; j++) {
1726 FTYPE **buffc = buffs + buff_ind;
1727 FTYPE *buffn = buffc[n];
1728 FTYPE *pk = k;
1729
1730 for (l = 0; l < n; l++) {
1731 FTYPE *buff_l = buffc[l];
1732
1733 for (off = 0; off < m;) {
1734 FTYPE *buff = buff_l + off;
1735
1736 kw = m - off;
1737
1738 if (kw > 2*MAX_KER) kw = MAX_KER; else
1739 if (kw > MAX_KER) kw = kw/2;
1740 off += kw;
1741
1742 sp = sl;
1743 dp = dl;
1744
1745 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1746 p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1747
1748 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1749 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1750 pk += kw;
1751
1752 if (kw == 7) {
1753
1754 if (l < (n - 1) || off < m) {
1755#ifdef __SUNPRO_C
1756#pragma pipeloop(0)
1757#endif /* __SUNPRO_C */
1758 for (i = 0; i <= (wid - 2); i += 2) {
1759 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1760
1761 p6 = buff[i + 6]; p7 = buff[i + 7];
1762
1763 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
1764 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
1765 }
1766
1767 } else {
1768#ifdef __SUNPRO_C
1769#pragma pipeloop(0)
1770#endif /* __SUNPRO_C */
1771 for (i = 0; i <= (wid - 2); i += 2) {
1772 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1773
1774 p6 = buff[i + 6]; p7 = buff[i + 7];
1775
1776 LOAD_BUFF(buffi);
1777
1778 dd.d64 = *(FTYPE *)(buffi + i);
1779 buffn[i ] = (FTYPE)dd.i32s.i0;
1780 buffn[i + 1] = (FTYPE)dd.i32s.i1;
1781
1782 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
1783 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
1784
1785 dp[0 ] = FROM_S32(d0);
1786 dp[chan1] = FROM_S32(d1);
1787
1788 buffd[i ] = 0.0;
1789 buffd[i + 1] = 0.0;
1790
1791 sp += chan2;
1792 dp += chan2;
1793 }
1794 }
1795
1796 } else if (kw == 6) {
1797
1798 if (l < (n - 1) || off < m) {
1799#ifdef __SUNPRO_C
1800#pragma pipeloop(0)
1801#endif /* __SUNPRO_C */
1802 for (i = 0; i <= (wid - 2); i += 2) {
1803 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1804
1805 p5 = buff[i + 5]; p6 = buff[i + 6];
1806
1807 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1808 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1809 }
1810
1811 } else {
1812#ifdef __SUNPRO_C
1813#pragma pipeloop(0)
1814#endif /* __SUNPRO_C */
1815 for (i = 0; i <= (wid - 2); i += 2) {
1816 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1817
1818 p5 = buff[i + 5]; p6 = buff[i + 6];
1819
1820 buffn[i ] = (FTYPE)sp[0];
1821 buffn[i + 1] = (FTYPE)sp[chan1];
1822
1823 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
1824 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1825
1826 dp[0 ] = FROM_S32(d0);
1827 dp[chan1] = FROM_S32(d1);
1828
1829 buffd[i ] = 0.0;
1830 buffd[i + 1] = 0.0;
1831
1832 sp += chan2;
1833 dp += chan2;
1834 }
1835 }
1836
1837 } else if (kw == 5) {
1838
1839 if (l < (n - 1) || off < m) {
1840#ifdef __SUNPRO_C
1841#pragma pipeloop(0)
1842#endif /* __SUNPRO_C */
1843 for (i = 0; i <= (wid - 2); i += 2) {
1844 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1845
1846 p4 = buff[i + 4]; p5 = buff[i + 5];
1847
1848 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1849 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1850 }
1851
1852 } else {
1853#ifdef __SUNPRO_C
1854#pragma pipeloop(0)
1855#endif /* __SUNPRO_C */
1856 for (i = 0; i <= (wid - 2); i += 2) {
1857 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1858
1859 p4 = buff[i + 4]; p5 = buff[i + 5];
1860
1861 buffn[i ] = (FTYPE)sp[0];
1862 buffn[i + 1] = (FTYPE)sp[chan1];
1863
1864 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
1865 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1866
1867 dp[0 ] = FROM_S32(d0);
1868 dp[chan1] = FROM_S32(d1);
1869
1870 buffd[i ] = 0.0;
1871 buffd[i + 1] = 0.0;
1872
1873 sp += chan2;
1874 dp += chan2;
1875 }
1876 }
1877
1878 } else if (kw == 4) {
1879
1880 if (l < (n - 1) || off < m) {
1881#ifdef __SUNPRO_C
1882#pragma pipeloop(0)
1883#endif /* __SUNPRO_C */
1884 for (i = 0; i <= (wid - 2); i += 2) {
1885 p0 = p2; p1 = p3; p2 = p4;
1886
1887 p3 = buff[i + 3]; p4 = buff[i + 4];
1888
1889 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1890 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1891 }
1892
1893 } else {
1894#ifdef __SUNPRO_C
1895#pragma pipeloop(0)
1896#endif /* __SUNPRO_C */
1897 for (i = 0; i <= (wid - 2); i += 2) {
1898 p0 = p2; p1 = p3; p2 = p4;
1899
1900 p3 = buff[i + 3]; p4 = buff[i + 4];
1901
1902 buffn[i ] = (FTYPE)sp[0];
1903 buffn[i + 1] = (FTYPE)sp[chan1];
1904
1905 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
1906 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1907
1908 dp[0 ] = FROM_S32(d0);
1909 dp[chan1] = FROM_S32(d1);
1910
1911 buffd[i ] = 0.0;
1912 buffd[i + 1] = 0.0;
1913
1914 sp += chan2;
1915 dp += chan2;
1916 }
1917 }
1918
1919 } else if (kw == 3) {
1920
1921 if (l < (n - 1) || off < m) {
1922#ifdef __SUNPRO_C
1923#pragma pipeloop(0)
1924#endif /* __SUNPRO_C */
1925 for (i = 0; i <= (wid - 2); i += 2) {
1926 p0 = p2; p1 = p3;
1927
1928 p2 = buff[i + 2]; p3 = buff[i + 3];
1929
1930 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
1931 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1932 }
1933
1934 } else {
1935#ifdef __SUNPRO_C
1936#pragma pipeloop(0)
1937#endif /* __SUNPRO_C */
1938 for (i = 0; i <= (wid - 2); i += 2) {
1939 p0 = p2; p1 = p3;
1940
1941 p2 = buff[i + 2]; p3 = buff[i + 3];
1942
1943 buffn[i ] = (FTYPE)sp[0];
1944 buffn[i + 1] = (FTYPE)sp[chan1];
1945
1946 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
1947 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1948
1949 dp[0 ] = FROM_S32(d0);
1950 dp[chan1] = FROM_S32(d1);
1951
1952 buffd[i ] = 0.0;
1953 buffd[i + 1] = 0.0;
1954
1955 sp += chan2;
1956 dp += chan2;
1957 }
1958 }
1959
1960 } else /*if (kw == 2)*/ {
1961
1962 if (l < (n - 1) || off < m) {
1963#ifdef __SUNPRO_C
1964#pragma pipeloop(0)
1965#endif /* __SUNPRO_C */
1966 for (i = 0; i <= (wid - 2); i += 2) {
1967 p0 = p2;
1968
1969 p1 = buff[i + 1]; p2 = buff[i + 2];
1970
1971 buffd[i ] += p0*k0 + p1*k1;
1972 buffd[i + 1] += p1*k0 + p2*k1;
1973 }
1974
1975 } else {
1976#ifdef __SUNPRO_C
1977#pragma pipeloop(0)
1978#endif /* __SUNPRO_C */
1979 for (i = 0; i <= (wid - 2); i += 2) {
1980 p0 = p2;
1981
1982 p1 = buff[i + 1]; p2 = buff[i + 2];
1983
1984 buffn[i ] = (FTYPE)sp[0];
1985 buffn[i + 1] = (FTYPE)sp[chan1];
1986
1987 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]);
1988 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
1989
1990 dp[0 ] = FROM_S32(d0);
1991 dp[chan1] = FROM_S32(d1);
1992
1993 buffd[i ] = 0.0;
1994 buffd[i + 1] = 0.0;
1995
1996 sp += chan2;
1997 dp += chan2;
1998 }
1999 }
2000 }
2001 }
2002 }
2003
2004 /* last pixels */
2005 for (; i < wid; i++) {
2006 FTYPE *pk = k, s = 0;
2007 mlib_s32 x, d0;
2008
2009 for (l = 0; l < n; l++) {
2010 FTYPE *buff = buffc[l] + i;
2011
2012 for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2013 }
2014
2015 d0 = D2I(s);
2016 dp[0] = FROM_S32(d0);
2017
2018 buffn[i] = (FTYPE)sp[0];
2019
2020 sp += chan1;
2021 dp += chan1;
2022 }
2023
2024 for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
2025
2026 /* next line */
2027 sl += sll;
2028 dl += dll;
2029
2030 buff_ind++;
2031
2032 if (buff_ind >= n + 1) buff_ind = 0;
2033 }
2034 }
2035
2036 if (pbuff != buff) mlib_free(pbuff);
2037
2038 return MLIB_SUCCESS;
2039}
2040
2041/***************************************************************/
2042#ifndef __sparc /* for x86, using integer multiplies is faster */
2043
2044#define STORE_RES(res, x) \
2045 x >>= shift2; \
2046 CLAMP_STORE(res, x)
2047
2048mlib_status CONV_FUNC_I(MxN)(mlib_image *dst,
2049 const mlib_image *src,
2050 const mlib_s32 *kernel,
2051 mlib_s32 m,
2052 mlib_s32 n,
2053 mlib_s32 dm,
2054 mlib_s32 dn,
2055 mlib_s32 scale,
2056 mlib_s32 cmask)
2057{
2058 mlib_s32 buff[BUFF_SIZE], *buffd = buff;
2059 mlib_s32 l, off, kw;
2060 mlib_s32 d0, d1, shift1, shift2;
2061 mlib_s32 k0, k1, k2, k3, k4, k5, k6;
2062 mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
2063 DTYPE *adr_src, *sl, *sp;
2064 DTYPE *adr_dst, *dl, *dp;
2065 mlib_s32 wid, hgt, sll, dll;
2066 mlib_s32 nchannel, chan1;
2067 mlib_s32 i, j, c;
2068 mlib_s32 chan2;
2069 mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
2070 GET_SRC_DST_PARAMETERS(DTYPE);
2071
2072#if IMG_TYPE != 1
2073 shift1 = 16;
2074#else
2075 shift1 = 8;
2076#endif /* IMG_TYPE != 1 */
2077 shift2 = scale - shift1;
2078
2079 chan1 = nchannel;
2080 chan2 = chan1 + chan1;
2081
2082 wid -= (m - 1);
2083 hgt -= (n - 1);
2084 adr_dst += dn*dll + dm*nchannel;
2085
2086 if (wid > BUFF_SIZE) {
2087 buffd = mlib_malloc(sizeof(mlib_s32)*wid);
2088
2089 if (buffd == NULL) return MLIB_FAILURE;
2090 }
2091
2092 if (m*n > MAX_N*MAX_N) {
2093 k = mlib_malloc(sizeof(mlib_s32)*(m*n));
2094
2095 if (k == NULL) {
2096 if (buffd != buff) mlib_free(buffd);
2097 return MLIB_FAILURE;
2098 }
2099 }
2100
2101 for (i = 0; i < m*n; i++) {
2102 k[i] = kernel[i] >> shift1;
2103 }
2104
2105 for (c = 0; c < nchannel; c++) {
2106 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
2107
2108 sl = adr_src + c;
2109 dl = adr_dst + c;
2110
2111#ifdef __SUNPRO_C
2112#pragma pipeloop(0)
2113#endif /* __SUNPRO_C */
2114 for (i = 0; i < wid; i++) buffd[i] = 0;
2115
2116 for (j = 0; j < hgt; j++) {
2117 mlib_s32 *pk = k;
2118
2119 for (l = 0; l < n; l++) {
2120 DTYPE *sp0 = sl + l*sll;
2121
2122 for (off = 0; off < m;) {
2123 sp = sp0 + off*chan1;
2124 dp = dl;
2125
2126 kw = m - off;
2127
2128 if (kw > 2*MAX_KER) kw = MAX_KER; else
2129 if (kw > MAX_KER) kw = kw/2;
2130 off += kw;
2131
2132 p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
2133 p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];
2134
2135 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2136 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
2137 pk += kw;
2138
2139 sp += (kw - 1)*chan1;
2140
2141 if (kw == 7) {
2142
2143 if (l < (n - 1) || off < m) {
2144#ifdef __SUNPRO_C
2145#pragma pipeloop(0)
2146#endif /* __SUNPRO_C */
2147 for (i = 0; i <= (wid - 2); i += 2) {
2148 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2149 p6 = sp[0];
2150 p7 = sp[chan1];
2151
2152 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2153 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2154
2155 sp += chan2;
2156 }
2157
2158 } else {
2159#ifdef __SUNPRO_C
2160#pragma pipeloop(0)
2161#endif /* __SUNPRO_C */
2162 for (i = 0; i <= (wid - 2); i += 2) {
2163 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2164 p6 = sp[0];
2165 p7 = sp[chan1];
2166
2167 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
2168 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2169
2170 STORE_RES(dp[0 ], d0);
2171 STORE_RES(dp[chan1], d1);
2172
2173 buffd[i ] = 0;
2174 buffd[i + 1] = 0;
2175
2176 sp += chan2;
2177 dp += chan2;
2178 }
2179 }
2180
2181 } else if (kw == 6) {
2182
2183 if (l < (n - 1) || off < m) {
2184#ifdef __SUNPRO_C
2185#pragma pipeloop(0)
2186#endif /* __SUNPRO_C */
2187 for (i = 0; i <= (wid - 2); i += 2) {
2188 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2189 p5 = sp[0];
2190 p6 = sp[chan1];
2191
2192 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2193 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2194
2195 sp += chan2;
2196 }
2197
2198 } else {
2199#ifdef __SUNPRO_C
2200#pragma pipeloop(0)
2201#endif /* __SUNPRO_C */
2202 for (i = 0; i <= (wid - 2); i += 2) {
2203 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2204 p5 = sp[0];
2205 p6 = sp[chan1];
2206
2207 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
2208 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2209
2210 STORE_RES(dp[0 ], d0);
2211 STORE_RES(dp[chan1], d1);
2212
2213 buffd[i ] = 0;
2214 buffd[i + 1] = 0;
2215
2216 sp += chan2;
2217 dp += chan2;
2218 }
2219 }
2220
2221 } else if (kw == 5) {
2222
2223 if (l < (n - 1) || off < m) {
2224#ifdef __SUNPRO_C
2225#pragma pipeloop(0)
2226#endif /* __SUNPRO_C */
2227 for (i = 0; i <= (wid - 2); i += 2) {
2228 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2229 p4 = sp[0];
2230 p5 = sp[chan1];
2231
2232 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2233 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2234
2235 sp += chan2;
2236 }
2237
2238 } else {
2239#ifdef __SUNPRO_C
2240#pragma pipeloop(0)
2241#endif /* __SUNPRO_C */
2242 for (i = 0; i <= (wid - 2); i += 2) {
2243 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2244 p4 = sp[0];
2245 p5 = sp[chan1];
2246
2247 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
2248 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2249
2250 STORE_RES(dp[0 ], d0);
2251 STORE_RES(dp[chan1], d1);
2252
2253 buffd[i ] = 0;
2254 buffd[i + 1] = 0;
2255
2256 sp += chan2;
2257 dp += chan2;
2258 }
2259 }
2260
2261 } else if (kw == 4) {
2262
2263 if (l < (n - 1) || off < m) {
2264#ifdef __SUNPRO_C
2265#pragma pipeloop(0)
2266#endif /* __SUNPRO_C */
2267 for (i = 0; i <= (wid - 2); i += 2) {
2268 p0 = p2; p1 = p3; p2 = p4;
2269 p3 = sp[0];
2270 p4 = sp[chan1];
2271
2272 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2273 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2274
2275 sp += chan2;
2276 }
2277
2278 } else {
2279#ifdef __SUNPRO_C
2280#pragma pipeloop(0)
2281#endif /* __SUNPRO_C */
2282 for (i = 0; i <= (wid - 2); i += 2) {
2283 p0 = p2; p1 = p3; p2 = p4;
2284 p3 = sp[0];
2285 p4 = sp[chan1];
2286
2287 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
2288 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2289
2290 STORE_RES(dp[0 ], d0);
2291 STORE_RES(dp[chan1], d1);
2292
2293 buffd[i ] = 0;
2294 buffd[i + 1] = 0;
2295
2296 sp += chan2;
2297 dp += chan2;
2298 }
2299 }
2300
2301 } else if (kw == 3) {
2302
2303 if (l < (n - 1) || off < m) {
2304#ifdef __SUNPRO_C
2305#pragma pipeloop(0)
2306#endif /* __SUNPRO_C */
2307 for (i = 0; i <= (wid - 2); i += 2) {
2308 p0 = p2; p1 = p3;
2309 p2 = sp[0];
2310 p3 = sp[chan1];
2311
2312 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
2313 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2314
2315 sp += chan2;
2316 }
2317
2318 } else {
2319#ifdef __SUNPRO_C
2320#pragma pipeloop(0)
2321#endif /* __SUNPRO_C */
2322 for (i = 0; i <= (wid - 2); i += 2) {
2323 p0 = p2; p1 = p3;
2324 p2 = sp[0];
2325 p3 = sp[chan1];
2326
2327 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
2328 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2329
2330 STORE_RES(dp[0 ], d0);
2331 STORE_RES(dp[chan1], d1);
2332
2333 buffd[i ] = 0;
2334 buffd[i + 1] = 0;
2335
2336 sp += chan2;
2337 dp += chan2;
2338 }
2339 }
2340
2341 } else if (kw == 2) {
2342
2343 if (l < (n - 1) || off < m) {
2344#ifdef __SUNPRO_C
2345#pragma pipeloop(0)
2346#endif /* __SUNPRO_C */
2347 for (i = 0; i <= (wid - 2); i += 2) {
2348 p0 = p2;
2349 p1 = sp[0];
2350 p2 = sp[chan1];
2351
2352 buffd[i ] += p0*k0 + p1*k1;
2353 buffd[i + 1] += p1*k0 + p2*k1;
2354
2355 sp += chan2;
2356 }
2357
2358 } else {
2359#ifdef __SUNPRO_C
2360#pragma pipeloop(0)
2361#endif /* __SUNPRO_C */
2362 for (i = 0; i <= (wid - 2); i += 2) {
2363 p0 = p2;
2364 p1 = sp[0];
2365 p2 = sp[chan1];
2366
2367 d0 = (p0*k0 + p1*k1 + buffd[i ]);
2368 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
2369
2370 STORE_RES(dp[0 ], d0);
2371 STORE_RES(dp[chan1], d1);
2372
2373 buffd[i ] = 0;
2374 buffd[i + 1] = 0;
2375
2376 sp += chan2;
2377 dp += chan2;
2378 }
2379 }
2380
2381 } else /*if (kw == 1)*/ {
2382
2383 if (l < (n - 1) || off < m) {
2384#ifdef __SUNPRO_C
2385#pragma pipeloop(0)
2386#endif /* __SUNPRO_C */
2387 for (i = 0; i <= (wid - 2); i += 2) {
2388 p0 = sp[0];
2389 p1 = sp[chan1];
2390
2391 buffd[i ] += p0*k0;
2392 buffd[i + 1] += p1*k0;
2393
2394 sp += chan2;
2395 }
2396
2397 } else {
2398#ifdef __SUNPRO_C
2399#pragma pipeloop(0)
2400#endif /* __SUNPRO_C */
2401 for (i = 0; i <= (wid - 2); i += 2) {
2402 p0 = sp[0];
2403 p1 = sp[chan1];
2404
2405 d0 = (p0*k0 + buffd[i ]);
2406 d1 = (p1*k0 + buffd[i + 1]);
2407
2408 STORE_RES(dp[0 ], d0);
2409 STORE_RES(dp[chan1], d1);
2410
2411 buffd[i ] = 0;
2412 buffd[i + 1] = 0;
2413
2414 sp += chan2;
2415 dp += chan2;
2416 }
2417 }
2418 }
2419 }
2420 }
2421
2422 /* last pixels */
2423 for (; i < wid; i++) {
2424 mlib_s32 *pk = k, s = 0;
2425 mlib_s32 x;
2426
2427 for (l = 0; l < n; l++) {
2428 sp = sl + l*sll + i*chan1;
2429
2430 for (x = 0; x < m; x++) {
2431 s += sp[0] * pk[0];
2432 sp += chan1;
2433 pk ++;
2434 }
2435 }
2436
2437 STORE_RES(dp[0], s);
2438
2439 sp += chan1;
2440 dp += chan1;
2441 }
2442
2443 sl += sll;
2444 dl += dll;
2445 }
2446 }
2447
2448 if (buffd != buff) mlib_free(buffd);
2449 if (k != k_locl) mlib_free(k);
2450
2451 return MLIB_SUCCESS;
2452}
2453
2454/***************************************************************/
2455#endif /* __sparc ( for x86, using integer multiplies is faster ) */
2456
2457/***************************************************************/