/*
 * Copyright 2003 Sun Microsystems, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */
25
26
/*
 * FUNCTION
 *      Internal functions for mlib_ImageConv* on U8/S16/U16 image types
 *      with the MLIB_EDGE_SRC_EXTEND edge condition.
 */
32
33#include "mlib_image.h"
34#include "mlib_ImageConv.h"
35#include "mlib_c_ImageConv.h"
36
/*
 * IMG_TYPE selects the pixel type this file is compiled for.  Each branch
 * below supplies the matching element type, the names of the generated
 * functions and the fixed-point conversion helpers:
 *
 *   1 - mlib_u8, 2 - mlib_s16, 3 - mlib_u16
 *
 * It is set to 1 here, so only the mlib_u8 branch is active in this file.
 */

#define IMG_TYPE 1

/***************************************************************/
#if IMG_TYPE == 1

/*
 * DTYPE            pixel element type
 * CONV_FUNC(KERN)  name + parameter list (PARAM) of the floating-point
 *                  function for a fixed kernel size, e.g. CONV_FUNC(3x3)
 * CONV_FUNC_MxN    same for the arbitrary-size kernel (PARAM_MxN)
 * CONV_FUNC_I(..)  name + parameter list of the integer-arithmetic variant
 * DSCALE           starting value of the kernel scale factor; LOAD_KERNEL*
 *                  divides it by 2^scalef_expon
 * SAT_OFF          text appended to the argument of D2I: here it subtracts
 *                  2^31 from the sum before it is clamped to mlib_s32
 * FROM_S32(x)      turns the clamped 32-bit value into a pixel: keep the
 *                  top 8 bits and flip the top one, undoing SAT_OFF
 * S64TOS32(x)      applied to the low half of the 64-bit store in LOAD_BUFF;
 *                  a no-op here
 */
#define DTYPE mlib_u8
#define CONV_FUNC(KERN) mlib_c_conv##KERN##ext_u8(PARAM)
#define CONV_FUNC_MxN mlib_c_convMxNext_u8(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
#define CONV_FUNC_MxN_I mlib_i_convMxNext_u8(PARAM_MxN)
#define DSCALE (1 << 24)
#define FROM_S32(x) (((x) >> 24) ^ 128)
#define S64TOS32(x) (x)
#define SAT_OFF -(1u << 31)

#elif IMG_TYPE == 2

/*
 * Signed 16-bit pixels: no saturation offset is needed (SAT_OFF is empty)
 * and the result is simply the top 16 bits.  S64TOS32 masks the value to
 * its low 32 bits before LOAD_BUFF ORs it with the upper half.
 */
#define DTYPE mlib_s16
#define CONV_FUNC(KERN) mlib_conv##KERN##ext_s16(PARAM)
#define CONV_FUNC_MxN mlib_convMxNext_s16(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
#define CONV_FUNC_MxN_I mlib_i_convMxNext_s16(PARAM_MxN)
#define DSCALE 65536.0
#define FROM_S32(x) ((x) >> 16)
#define S64TOS32(x) ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

/*
 * Unsigned 16-bit pixels: same scheme as the 8-bit case, with the result
 * in the top 16 bits and bit 15 flipped back by FROM_S32.
 */
#define DTYPE mlib_u16
#define CONV_FUNC(KERN) mlib_conv##KERN##ext_u16(PARAM)
#define CONV_FUNC_MxN mlib_convMxNext_u16(PARAM_MxN)
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
#define CONV_FUNC_MxN_I mlib_i_convMxNext_u16(PARAM_MxN)
#define DSCALE 65536.0
#define FROM_S32(x) (((x) >> 16) ^ 0x8000)
#define S64TOS32(x) (x)
#define SAT_OFF -(1u << 31)

#endif /* IMG_TYPE == 1 */
81
82/***************************************************************/
/* Number of extra source pixels/rows a KSIZE-wide kernel needs (KSIZE is
 * (re)defined in front of each function below). */
#define KSIZE1 (KSIZE - 1)

/***************************************************************/
/*
 * Parameter list shared by the fixed-size kernel functions.
 *
 *   dst, src       destination and source images
 *   dx_l, dx_r     number of columns replicated from the first / last
 *                  source pixel at the left / right edge
 *   dy_t, dy_b     number of rows replicated at the top / bottom edge
 *   kern           kernel coefficients, row by row
 *   scalef_expon   coefficients are scaled by 2^(-scalef_expon)
 *   cmask          channel mask; bit (nchannel - 1 - c) enables channel c
 */
#define PARAM \
  mlib_image *dst, \
  const mlib_image *src, \
  mlib_s32 dx_l, \
  mlib_s32 dx_r, \
  mlib_s32 dy_t, \
  mlib_s32 dy_b, \
  const mlib_s32 *kern, \
  mlib_s32 scalef_expon, \
  mlib_s32 cmask

/***************************************************************/
/*
 * Parameter list of the arbitrary-size (m x n) kernel functions.  The
 * functions using it are not part of the code visible here.
 */
#define PARAM_MxN \
  mlib_image *dst, \
  const mlib_image *src, \
  const mlib_s32 *kernel, \
  mlib_s32 m, \
  mlib_s32 n, \
  mlib_s32 dx_l, \
  mlib_s32 dx_r, \
  mlib_s32 dy_t, \
  mlib_s32 dy_b, \
  mlib_s32 scale, \
  mlib_s32 cmask
110
111/***************************************************************/
/* Floating-point type used for the line buffers and the accumulation. */
#define FTYPE mlib_d64

#ifndef MLIB_USE_FTOI_CLAMPING

/*
 * Convert a floating-point value to mlib_s32 with saturation at
 * MLIB_S32_MIN / MLIB_S32_MAX.  Note that the argument is evaluated more
 * than once.
 */
#define CLAMP_S32(x) \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

/*
 * Plain cast.  NOTE(review): presumably selected on platforms whose
 * float-to-int conversion already saturates - confirm where
 * MLIB_USE_FTOI_CLAMPING is defined.
 */
#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/*
 * Floating-point sum -> saturated mlib_s32.  SAT_OFF is pasted textually
 * after the argument (an offset of -2^31 for the unsigned types, nothing
 * for mlib_s16); FROM_S32 later removes that offset again.
 */
#define D2I(x) CLAMP_S32((x) SAT_OFF)
127
128/***************************************************************/
/*
 * STORE2(res0, res1): write two results to dp[0] and dp[chan1].  It is
 * used only in the __sparc paths, where the two values are unpacked from a
 * 64-bit word, so their order is swapped on little-endian targets.
 * Expects dp and chan1 to be in scope.
 */
#ifdef _LITTLE_ENDIAN

#define STORE2(res0, res1) \
  dp[0 ] = res1; \
  dp[chan1] = res0

#else

#define STORE2(res0, res1) \
  dp[0 ] = res0; \
  dp[chan1] = res1

#endif /* _LITTLE_ENDIAN */

/***************************************************************/
/*
 * LOAD_BUFF(buff): copy the two source pixels sp[0] and sp[chan1] into the
 * mlib_s32 line buff at positions i and i + 1.  Expects i, sp and chan1 to
 * be in scope.
 *
 * Without 64-bit integers (_NO_LONGLONG) this is two plain stores.
 * Otherwise both values are packed into one mlib_s64 and written with a
 * single store through a casted pointer; the halves are ordered by
 * endianness so that sp[0] still lands in buff[i].  That store requires
 * (buff + i) to be suitably aligned for mlib_s64.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff) \
  buff[i ] = sp[0]; \
  buff[i + 1] = sp[chan1]

#else /* _NO_LONGLONG */

#ifdef _LITTLE_ENDIAN

#define LOAD_BUFF(buff) \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])

#else /* _LITTLE_ENDIAN */

#define LOAD_BUFF(buff) \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])

#endif /* _LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
164
165/***************************************************************/
/* 2^24 as a float constant.  Not referenced in the part of the file
 * visible here. */
#define MLIB_D2_24 16777216.0f

/***************************************************************/
/*
 * View of one 64-bit double as two 32-bit integers.  The convolution loops
 * use it to move a pair of mlib_s32 values to/from the integer scratch
 * lines with a single 64-bit access.
 */
typedef union {
  mlib_d64 d64;
  struct {
    mlib_s32 i0;
    mlib_s32 i1;
  } i32s;
} d64_2x32;

/***************************************************************/
/* Line length (in elements) covered by the on-stack buffers; longer lines
 * are served from mlib_malloc. */
#define BUFF_LINE 256
179
180/***************************************************************/
/*
 * Local variables common to the floating-point convolution functions.
 * Requires an array named buff to be declared beforehand; pbuff starts out
 * pointing at it.
 */
#define DEF_VARS(type) \
  type *adr_src, *sl, *sp, *sl1; \
  type *adr_dst, *dl, *dp; \
  FTYPE *pbuff = buff; \
  mlib_s32 *buffi, *buffo; \
  mlib_s32 wid, hgt, sll, dll; \
  mlib_s32 nchannel, chan1, chan2; \
  mlib_s32 i, j, c, swid

/***************************************************************/
/*
 * Declares the nine scaled coefficients k0..k8 and the pixel temporaries
 * of the 3x3 function, then computes
 *
 *     kN = DSCALE * 2^(-scalef_expon) * kern[N]
 *
 * The division is done in steps of 2^30 so that the shift count of the
 * final (1 << scalef_expon) stays within the range of int.  scalef_expon
 * is modified in the process.
 */
#define LOAD_KERNEL3() \
  FTYPE scalef = DSCALE; \
  FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8; \
  FTYPE p00, p01, p02, p03, \
        p10, p11, p12, p13, \
        p20, p21, p22, p23; \
  \
  while (scalef_expon > 30) { \
    scalef /= (1 << 30); \
    scalef_expon -= 30; \
  } \
  \
  scalef /= (1 << scalef_expon); \
  \
  /* keep kernel in regs */ \
  k0 = scalef * kern[0]; k1 = scalef * kern[1]; k2 = scalef * kern[2]; \
  k3 = scalef * kern[3]; k4 = scalef * kern[4]; k5 = scalef * kern[5]; \
  k6 = scalef * kern[6]; k7 = scalef * kern[7]; k8 = scalef * kern[8]

/***************************************************************/
/*
 * Same scaling as LOAD_KERNEL3, but for SIZE coefficients written to the
 * caller's array k[].  Uses j as the loop index.
 */
#define LOAD_KERNEL(SIZE) \
  FTYPE scalef = DSCALE; \
  \
  while (scalef_expon > 30) { \
    scalef /= (1 << 30); \
    scalef_expon -= 30; \
  } \
  \
  scalef /= (1 << scalef_expon); \
  \
  for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]

/***************************************************************/
/*
 * Fetch image geometry and data pointers.  Width, height and channel count
 * are all taken from src; the strides are converted from bytes to elements
 * of the given type.
 */
#define GET_SRC_DST_PARAMETERS(type) \
  hgt = mlib_ImageGetHeight(src); \
  wid = mlib_ImageGetWidth(src); \
  nchannel = mlib_ImageGetChannels(src); \
  sll = mlib_ImageGetStride(src) / sizeof(type); \
  dll = mlib_ImageGetStride(dst) / sizeof(type); \
  adr_src = (type *)mlib_ImageGetData(src); \
  adr_dst = (type *)mlib_ImageGetData(dst)
232
233/***************************************************************/
#ifndef __sparc

/*
 * CLAMP_STORE(dst, val): saturate the integer result val to the range of
 * the destination pixel type and store it in dst.  Used by the integer
 * (non-SPARC) convolution functions.
 *
 * Every variant expands to one complete if/else statement with braced
 * branches and parenthesized arguments, so an invocation is valid both
 * with and without a trailing semicolon.  The call sites in this file
 * omit the semicolon, which previously only worked for IMG_TYPE == 1.
 * As with any bare if/else macro, do not use it as the unbraced body of
 * another if that has its own else.  Both arguments may be evaluated more
 * than once.
 */
#if IMG_TYPE == 1

/*
 * Test for the presence of any "1" bit in bits
 * 8 to 31 of val. If present, then val is either
 * negative or >255. If over/underflows of 8 bits
 * are uncommon, then this technique can be a win,
 * since only a single test, rather than two, is
 * necessary to determine if clamping is needed.
 * On the other hand, if over/underflows are common,
 * it adds an extra test.
 */
#define CLAMP_STORE(dst, val) \
  if ((val) & 0xffffff00) { \
    if ((val) < MLIB_U8_MIN) { \
      (dst) = MLIB_U8_MIN; \
    } else { \
      (dst) = MLIB_U8_MAX; \
    } \
  } else { \
    (dst) = (mlib_u8)(val); \
  }

#elif IMG_TYPE == 2

#define CLAMP_STORE(dst, val) \
  if ((val) >= MLIB_S16_MAX) { \
    (dst) = MLIB_S16_MAX; \
  } else if ((val) <= MLIB_S16_MIN) { \
    (dst) = MLIB_S16_MIN; \
  } else { \
    (dst) = (mlib_s16)(val); \
  }

#elif IMG_TYPE == 3

#define CLAMP_STORE(dst, val) \
  if ((val) >= MLIB_U16_MAX) { \
    (dst) = MLIB_U16_MAX; \
  } else if ((val) <= MLIB_U16_MIN) { \
    (dst) = MLIB_U16_MIN; \
  } else { \
    (dst) = (mlib_u16)(val); \
  }

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */
279
280/***************************************************************/
#define KSIZE 3

/*
 * 3x3 convolution with replicated ("source extend") edges, floating-point
 * path.  See PARAM for the meaning of the arguments.
 *
 * For every channel enabled in cmask, three consecutive source rows are
 * kept in the FTYPE line buffers buff0..buff2, each extended by dx_l
 * copies of its first pixel on the left and dx_r copies of its last pixel
 * on the right.  While one output row is computed, the next source row is
 * converted into buff3; afterwards the four buffers are rotated.  Rows
 * missing at the top/bottom (dy_t, dy_b) are obtained by not advancing the
 * source row pointer.
 *
 * Returns MLIB_FAILURE if the line buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 */
mlib_status CONV_FUNC(3x3)
{
  FTYPE buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
  DEF_VARS(DTYPE);
  DTYPE *sl2;
#ifndef __sparc
  mlib_s32 d0, d1;
#endif /* __sparc */
  LOAD_KERNEL3();
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* length of one extended line: wid outputs need wid + KSIZE - 1 inputs */
  swid = wid + KSIZE1;

  /* the on-stack buffer only covers lines up to BUFF_LINE elements */
  if (swid > BUFF_LINE) {
    pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE )*swid);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  /*
   * Four FTYPE lines, followed by one more FTYPE line's worth of storage
   * that is shared by the two mlib_s32 scratch lines buffo and buffi.
   * buffi starts at an even mlib_s32 offset from buffo.
   */
  buff0 = pbuff;
  buff1 = buff0 + swid;
  buff2 = buff1 + swid;
  buff3 = buff2 + swid;
  buffo = (mlib_s32*)(buff3 + swid);
  buffi = buffo + (swid &~ 1);

  /* from here on swid is the number of pixels actually read per source row */
  swid -= (dx_l + dx_r);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  for (c = 0; c < nchannel; c++) {
    /* skip channels not selected in cmask */
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* kernel row 1 is a new source row only if it is neither top nor
       bottom padding; otherwise the previous row is reused */
    if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
    else sl1 = sl;

    if ((hgt - dy_b) > 0) sl2 = sl1 + sll;
    else sl2 = sl1;

    /* left padding: replicate the first pixel of each row */
    for (i = 0; i < dx_l; i++) {
      buff0[i] = (FTYPE)sl[0];
      buff1[i] = (FTYPE)sl1[0];
      buff2[i] = (FTYPE)sl2[0];
    }

    /* convert the source pixels of the first three rows */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < swid; i++) {
      buff0[i + dx_l] = (FTYPE)sl[i*chan1];
      buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
      buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
    }

    /* right padding: replicate the last converted pixel of each row */
    for (i = 0; i < dx_r; i++) {
      buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
      buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
      buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
    }

    /* sl now points at the row that will be loaded into buff3 */
    if ((hgt - dy_b) > 1) sl = sl2 + sll;
    else sl = sl2;

    for (j = 0; j < hgt; j++) {
      /* partial sums of the next two outputs, carried between iterations */
      FTYPE s0, s1;

      p02 = buff0[0];
      p12 = buff1[0];
      p22 = buff2[0];

      p03 = buff0[1];
      p13 = buff1[1];
      p23 = buff2[1];

      s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
      s1 = p03 * k0 + p13 * k3 + p23 * k6;

      sp = sl;
      dp = dl;

      /* main loop: two output pixels per iteration */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
#ifdef __sparc
#ifdef _NO_LONGLONG
        mlib_s32 o64_1, o64_2;
#else /* _NO_LONGLONG */
        mlib_s64 o64;
#endif /* _NO_LONGLONG */
#endif /* __sparc */
        d64_2x32 dd;

        p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
        p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];

        /* fetch two pixels of the next source row into buffi ... */
        LOAD_BUFF(buffi);

        /* ... and convert them to FTYPE in buff3 */
        dd.d64 = *(FTYPE *)(buffi + i);
        buff3[i + dx_l ] = (FTYPE)dd.i32s.i0;
        buff3[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

#ifndef __sparc

        /* complete the carried sums with the two new columns */
        d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
        d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);

        /* start the sums of the following pair */
        s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
        s1 = p03 * k0 + p13 * k3 + p23 * k6;

        dp[0 ] = FROM_S32(d0);
        dp[chan1] = FROM_S32(d1);

#else /* __sparc */

        /* results go through the integer scratch line buffo */
        dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
        dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
        *(FTYPE *)(buffo + i) = dd.d64;

        s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
        s1 = p03 * k0 + p13 * k3 + p23 * k6;

#ifdef _NO_LONGLONG

        o64_1 = buffo[i];
        o64_2 = buffo[i+1];
#if IMG_TYPE != 1
        STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
#else
        /* 8-bit case: only the shift is applied here, see the pass after
           the channel loop */
        STORE2(o64_1 >> 24, o64_2 >> 24);
#endif /* IMG_TYPE != 1 */

#else /* _NO_LONGLONG */

        o64 = *(mlib_s64*)(buffo + i);
#if IMG_TYPE != 1
        STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
#else
        STORE2(o64 >> 56, o64 >> 24);
#endif /* IMG_TYPE != 1 */
#endif /* _NO_LONGLONG */
#endif /* __sparc */

        sp += chan2;
        dp += chan2;
      }

      /* remaining output pixel when wid is odd: full 9-tap sum */
      for (; i < wid; i++) {
        p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i];
        p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
        p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];

        buffi[i] = (mlib_s32)sp[0];
        buff3[i + dx_l] = (FTYPE)buffi[i];

#ifndef __sparc

        d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
                 p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);

        dp[0] = FROM_S32(d0);

#else /* __sparc */

        buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
                       p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
#if IMG_TYPE != 1
        dp[0] = FROM_S32(buffo[i]);
#else
        dp[0] = buffo[i] >> 24;
#endif /* IMG_TYPE != 1 */
#endif /* __sparc */

        sp += chan1;
        dp += chan1;
      }

      /* load whatever is left of the next source row (no output here) */
      for (; i < swid; i++) {
        buffi[i] = (mlib_s32)sp[0];
        buff3[i + dx_l] = (FTYPE)buffi[i];
        sp += chan1;
      }

      /* pad the freshly loaded row on both sides */
      for (i = 0; i < dx_l; i++) buff3[i] = buff3[dx_l];
      for (i = 0; i < dx_r; i++) buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];

      /* stop advancing once the last available source row is reached, so
         that it is reused for the bottom padding */
      if (j < hgt - dy_b - 2) sl += sll;
      dl += dll;

      /* rotate the line buffers: the oldest row becomes the load target */
      buffT = buff0;
      buff0 = buff1;
      buff1 = buff2;
      buff2 = buff3;
      buff3 = buffT;
    }
  }

#ifdef __sparc
#if IMG_TYPE == 1
  /*
   * NOTE(review): the SPARC 8-bit stores above omit the "^ 128" that
   * FROM_S32 applies; these helpers (defined elsewhere) presumably XOR the
   * destination with 0x80 to make up for it - confirm.  The _aa variant is
   * used when all channels are selected.
   */
  {
    mlib_s32 amask = (1 << nchannel) - 1;

    if ((cmask & amask) != amask) {
      mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
    } else {
      mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
    }
  }

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */

  /* release the line buffer if it came from mlib_malloc */
  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
503
504/***************************************************************/
#ifndef __sparc /* for x86, using integer multiplies is faster */

/*
 * 3x3 convolution with replicated edges, integer-arithmetic path (not
 * built for SPARC).  See PARAM for the meaning of the arguments.
 *
 * The kernel coefficients are reduced by shift1 bits up front and the
 * accumulated sum is shifted right by the remaining
 * shift2 = scalef_expon - shift1 bits, then saturated by CLAMP_STORE.
 * No line buffers are used: the three source rows are read directly, and
 * edge replication is achieved by not advancing the row/column pointers.
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status CONV_FUNC_I(3x3)
{
  DTYPE *adr_src, *sl, *sp0, *sp1, *sp2, *sp_1, *sp_2;
  DTYPE *adr_dst, *dl, *dp;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel, chan1, chan2, delta_chan;
  mlib_s32 i, j, c;
  mlib_s32 shift1, shift2;
  mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
  mlib_s32 p02, p03,
           p12, p13,
           p22, p23;

#if IMG_TYPE != 1
  shift1 = 16;
#else
  shift1 = 8;
#endif /* IMG_TYPE != 1 */

  /* NOTE(review): assumes scalef_expon >= shift1 so that the shift count
     is non-negative - confirm against the caller */
  shift2 = scalef_expon - shift1;

  /* keep kernel in regs */
  k0 = kern[0] >> shift1; k1 = kern[1] >> shift1; k2 = kern[2] >> shift1;
  k3 = kern[3] >> shift1; k4 = kern[4] >> shift1; k5 = kern[5] >> shift1;
  k6 = kern[6] >> shift1; k7 = kern[7] >> shift1; k8 = kern[8] >> shift1;

  GET_SRC_DST_PARAMETERS(DTYPE);

  chan1 = nchannel;
  chan2 = chan1 + chan1;
  delta_chan = 0;

  /* offset of kernel column 1 from column 0: one pixel, or zero when that
     column is padding and column 0 has to be repeated */
  if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan = chan1;

  for (c = 0; c < chan1; c++) {
    /* skip channels not selected in cmask */
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* sp_1, sp_2 and sl become the three source rows of the first output
       row; a row pointer is not advanced where the row is padding */
    sp_1 = sl;

    if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
    sp_2 = sl;

    if ((hgt - dy_b) > 0) sl += sll;

    for (j = 0; j < hgt; j++) {
      /* partial sums of the next two outputs, carried between iterations */
      mlib_s32 s0, s1;
      mlib_s32 pix0, pix1;

      /* slide the three-row window down by one row */
      dp = dl;
      sp0 = sp_1;
      sp_1 = sp_2;
      sp_2 = sl;

      sp1 = sp_1;
      sp2 = sp_2;

      p02 = sp0[0];
      p12 = sp1[0];
      p22 = sp2[0];

      p03 = sp0[delta_chan];
      p13 = sp1[delta_chan];
      p23 = sp2[delta_chan];

      s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
      s1 = p03 * k0 + p13 * k3 + p23 * k6;

      sp0 += (chan1 + delta_chan);
      sp1 += (chan1 + delta_chan);
      sp2 += (chan1 + delta_chan);

      /*
       * Main loop, two outputs per iteration, over the columns that do not
       * touch the right padding.  The IMG_TYPE == 1 form of CLAMP_STORE
       * expands to a complete if/else statement, hence no semicolon.
       */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - dx_r - 2); i += 2) {
        p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
        p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];

        pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
        pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
                p13 * k5 + p22 * k7 + p23 * k8) >> shift2;

        CLAMP_STORE(dp[0], pix0)
        CLAMP_STORE(dp[chan1], pix1)

        s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
        s1 = p03 * k0 + p13 * k3 + p23 * k6;

        sp0 += chan2;
        sp1 += chan2;
        sp2 += chan2;
        dp += chan2;
      }

      /* switch to one output per iteration: the last column read becomes
         the "previous" column */
      p02 = p03; p12 = p13; p22 = p23;

      for (; i < wid - dx_r; i++) {
        p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
        pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
        CLAMP_STORE(dp[0], pix0)
        s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
        p02 = p03; p12 = p13; p22 = p23;
        sp0 += chan1;
        sp1 += chan1;
        sp2 += chan1;
        dp += chan1;
      }

      /* step back to the last source pixel; it is re-read for every
         output that reaches into the right padding */
      sp0 -= chan1;
      sp1 -= chan1;
      sp2 -= chan1;

      for (; i < wid; i++) {
        p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
        pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
        CLAMP_STORE(dp[0], pix0)
        s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
        p02 = p03; p12 = p13; p22 = p23;
        dp += chan1;
      }

      /* stop advancing at the last available source row (bottom padding) */
      if (j < hgt - dy_b - 1) sl += sll;
      dl += dll;
    }
  }

  return MLIB_SUCCESS;
}

#endif /* __sparc ( for x86, using integer multiplies is faster ) */
640
641/***************************************************************/
#undef KSIZE
#define KSIZE 4

/*
 * 4x4 convolution with replicated edges, floating-point path.  See PARAM
 * for the meaning of the arguments.
 *
 * Four source rows live in the FTYPE line buffers buff0..buff3 (padded by
 * dx_l / dx_r replicated pixels); buff4 receives the next source row.
 * Each output row is produced in two passes over the line: the first pass
 * applies kernel rows 0-1 and leaves the partial sums in buffd, the second
 * pass adds kernel rows 2-3, saturates and stores.  The line buffers are
 * rotated after every output row.
 *
 * Returns MLIB_FAILURE if the line buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 */
mlib_status CONV_FUNC(4x4)
{
  FTYPE buff[(KSIZE + 3)*BUFF_LINE];
  FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
  FTYPE k[KSIZE*KSIZE];
  mlib_s32 d0, d1;
  FTYPE k0, k1, k2, k3, k4, k5, k6, k7;
  FTYPE p00, p01, p02, p03, p04,
        p10, p11, p12, p13, p14,
        p20, p21, p22, p23,
        p30, p31, p32, p33;
  DEF_VARS(DTYPE);
  DTYPE *sl2, *sl3;
  LOAD_KERNEL(KSIZE*KSIZE);
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* length of one extended line */
  swid = wid + KSIZE1;

  /* the on-stack buffer only covers lines up to BUFF_LINE elements */
  if (swid > BUFF_LINE) {
    pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE )*swid);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  /* five row buffers, the partial-sum line buffd, then one FTYPE line's
     worth of storage shared by the mlib_s32 scratch lines buffo / buffi */
  buff0 = pbuff;
  buff1 = buff0 + swid;
  buff2 = buff1 + swid;
  buff3 = buff2 + swid;
  buff4 = buff3 + swid;
  buffd = buff4 + swid;
  buffo = (mlib_s32*)(buffd + swid);
  buffi = buffo + (swid &~ 1);

  /* from here on swid is the number of pixels actually read per source row */
  swid -= (dx_l + dx_r);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  for (c = 0; c < nchannel; c++) {
    /* skip channels not selected in cmask */
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* pick the first four source rows; a row pointer is not advanced
       where the kernel row falls into the top or bottom padding */
    if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
    else sl1 = sl;

    if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
    else sl2 = sl1;

    if ((hgt - dy_b) > 0) sl3 = sl2 + sll;
    else sl3 = sl2;

    /* left padding */
    for (i = 0; i < dx_l; i++) {
      buff0[i] = (FTYPE)sl[0];
      buff1[i] = (FTYPE)sl1[0];
      buff2[i] = (FTYPE)sl2[0];
      buff3[i] = (FTYPE)sl3[0];
    }

    /* convert the source pixels of the first four rows */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < swid; i++) {
      buff0[i + dx_l] = (FTYPE)sl[i*chan1];
      buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
      buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
      buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
    }

    /* right padding */
    for (i = 0; i < dx_r; i++) {
      buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
      buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
      buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
      buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
    }

    /* sl now points at the row that will be loaded into buff4 */
    if ((hgt - dy_b) > 1) sl = sl3 + sll;
    else sl = sl3;

    for (j = 0; j < hgt; j++) {
      d64_2x32 dd;

      /*
       * First loop on two first lines of kernel
       */
      k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
      k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];

      sp = sl;
      dp = dl;

      p02 = buff0[0];
      p12 = buff1[0];
      p03 = buff0[1];
      p13 = buff1[1];
      p04 = buff0[2];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        /* shift the pixel window two columns to the right */
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = buff1[i + 2];
        p03 = buff0[i + 3]; p13 = buff1[i + 3];
        p04 = buff0[i + 4]; p14 = buff1[i + 4];

        /* fetch two pixels of the next source row and convert them */
        LOAD_BUFF(buffi);

        dd.d64 = *(FTYPE *)(buffi + i);
        buff4[i + dx_l ] = (FTYPE)dd.i32s.i0;
        buff4[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

        /* partial sums over kernel rows 0 and 1 */
        buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
                     p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
        buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
                        p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);

        sp += chan2;
      }

      /*
       * Second loop on two last lines of kernel
       */
      k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
      k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];

      p02 = buff2[0];
      p12 = buff3[0];
      p03 = buff2[1];
      p13 = buff3[1];
      p04 = buff2[2];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = buff3[i + 2];
        p03 = buff2[i + 3]; p13 = buff3[i + 3];
        p04 = buff2[i + 4]; p14 = buff3[i + 4];

        /* add the first-pass partial sums, saturate and store */
        d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
                 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
        d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
                 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);

        dp[0 ] = FROM_S32(d0);
        dp[chan1] = FROM_S32(d1);

        dp += chan2;
      }

      /* last pixels */
      /* (odd wid) computed with the full 16-tap sum in one go */
      for (; i < wid; i++) {
        p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
        p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
        p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
        p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];

        buff4[i + dx_l] = (FTYPE)sp[0];

        buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
                       p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
                       p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
                       p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);

        dp[0] = FROM_S32(buffo[i]);

        sp += chan1;
        dp += chan1;
      }

      /* load whatever is left of the next source row (no output here) */
      for (; i < swid; i++) {
        buff4[i + dx_l] = (FTYPE)sp[0];
        sp += chan1;
      }

      /* pad the freshly loaded row on both sides */
      for (i = 0; i < dx_l; i++) buff4[i] = buff4[dx_l];
      for (i = 0; i < dx_r; i++) buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];

      /* next line */

      /* stop advancing at the last available source row (bottom padding) */
      if (j < hgt - dy_b - 2) sl += sll;
      dl += dll;

      /* rotate the line buffers: the oldest row becomes the load target */
      buffT = buff0;
      buff0 = buff1;
      buff1 = buff2;
      buff2 = buff3;
      buff3 = buff4;
      buff4 = buffT;
    }
  }

  /* release the line buffer if it came from mlib_malloc */
  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
846
847/***************************************************************/
#undef KSIZE
#define KSIZE 5

/*
 * 5x5 convolution with replicated edges, floating-point path.  See PARAM
 * for the meaning of the arguments.
 *
 * Five source rows live in the FTYPE line buffers buff0..buff4 (padded by
 * dx_l / dx_r replicated pixels); buff5 receives the next source row.
 * Each output row is produced in three passes over the line:
 *   1. kernel rows 0-1 -> buffd, next source row fetched into buffi;
 *   2. kernel rows 2-3 added to buffd, buffi converted into buff5;
 *   3. kernel row 4 added, result saturated and stored.
 * The line buffers are rotated after every output row.
 *
 * Returns MLIB_FAILURE if the line buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 */
mlib_status CONV_FUNC(5x5)
{
  FTYPE buff[(KSIZE + 3)*BUFF_LINE];
  FTYPE *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
  FTYPE k[KSIZE*KSIZE];
  mlib_s32 d0, d1;
  FTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
  FTYPE p00, p01, p02, p03, p04, p05,
        p10, p11, p12, p13, p14, p15,
        p20, p21, p22, p23, p24,
        p30, p31, p32, p33, p34,
        p40, p41, p42, p43, p44;
  DEF_VARS(DTYPE);
  DTYPE *sl2, *sl3, *sl4;
  LOAD_KERNEL(KSIZE*KSIZE);
  GET_SRC_DST_PARAMETERS(DTYPE);

  /* length of one extended line */
  swid = wid + KSIZE1;

  /* the on-stack buffer only covers lines up to BUFF_LINE elements */
  if (swid > BUFF_LINE) {
    pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE )*swid);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  /* six row buffers, the partial-sum line buffd, then one FTYPE line's
     worth of storage shared by the mlib_s32 scratch lines buffo / buffi */
  buff0 = pbuff;
  buff1 = buff0 + swid;
  buff2 = buff1 + swid;
  buff3 = buff2 + swid;
  buff4 = buff3 + swid;
  buff5 = buff4 + swid;
  buffd = buff5 + swid;
  buffo = (mlib_s32*)(buffd + swid);
  buffi = buffo + (swid &~ 1);

  /* from here on swid is the number of pixels actually read per source row */
  swid -= (dx_l + dx_r);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  for (c = 0; c < nchannel; c++) {
    /* skip channels not selected in cmask */
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* pick the first five source rows; a row pointer is not advanced
       where the kernel row falls into the top or bottom padding */
    if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
    else sl1 = sl;

    if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
    else sl2 = sl1;

    if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
    else sl3 = sl2;

    if ((hgt - dy_b) > 0) sl4 = sl3 + sll;
    else sl4 = sl3;

    /* left padding */
    for (i = 0; i < dx_l; i++) {
      buff0[i] = (FTYPE)sl[0];
      buff1[i] = (FTYPE)sl1[0];
      buff2[i] = (FTYPE)sl2[0];
      buff3[i] = (FTYPE)sl3[0];
      buff4[i] = (FTYPE)sl4[0];
    }

    /* convert the source pixels of the first five rows */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < swid; i++) {
      buff0[i + dx_l] = (FTYPE)sl[i*chan1];
      buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
      buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
      buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
      buff4[i + dx_l] = (FTYPE)sl4[i*chan1];
    }

    /* right padding */
    for (i = 0; i < dx_r; i++) {
      buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
      buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
      buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
      buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
      buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];
    }

    /* sl now points at the row that will be loaded into buff5 */
    if ((hgt - dy_b) > 1) sl = sl4 + sll;
    else sl = sl4;

    for (j = 0; j < hgt; j++) {
      d64_2x32 dd;

      /*
       * First loop
       */
      /* kernel rows 0 and 1 applied to buff0 / buff1 */
      k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
      k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];

      sp = sl;
      dp = dl;

      p02 = buff0[0];
      p12 = buff1[0];
      p03 = buff0[1];
      p13 = buff1[1];
      p04 = buff0[2];
      p14 = buff1[2];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        /* shift the pixel window two columns to the right */
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;
        p02 = p04; p12 = p14;

        /* fetch two pixels of the next source row; they are converted to
           FTYPE in the second loop */
        LOAD_BUFF(buffi);

        p03 = buff0[i + 3]; p13 = buff1[i + 3];
        p04 = buff0[i + 4]; p14 = buff1[i + 4];
        p05 = buff0[i + 5]; p15 = buff1[i + 5];

        buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
        buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
                        p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);

        sp += chan2;
      }

      /*
       * Second loop
       */
      /* kernel rows 2 and 3 applied to buff2 / buff3, added to buffd */
      k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
      k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];

      p02 = buff2[0];
      p12 = buff3[0];
      p03 = buff2[1];
      p13 = buff3[1];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        p00 = p02; p10 = p12;
        p01 = p03; p11 = p13;

        p02 = buff2[i + 2]; p12 = buff3[i + 2];
        p03 = buff2[i + 3]; p13 = buff3[i + 3];
        p04 = buff2[i + 4]; p14 = buff3[i + 4];
        p05 = buff2[i + 5]; p15 = buff3[i + 5];

        /* convert the pixels fetched by the first loop into buff5 */
        dd.d64 = *(FTYPE *)(buffi + i);
        buff5[i + dx_l ] = (FTYPE)dd.i32s.i0;
        buff5[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

        buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
                      p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
        buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
      }

      /*
       * 3 loop
       */
      /* kernel row 4 applied to buff4; final sum is saturated and stored */
      k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];

      p02 = buff4[0];
      p03 = buff4[1];
      p04 = buff4[2];
      p05 = buff4[3];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i <= (wid - 2); i += 2) {
        p00 = p02; p01 = p03; p02 = p04; p03 = p05;

        p04 = buff4[i + 4]; p05 = buff4[i + 5];

        d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
        d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);

        dp[0 ] = FROM_S32(d0);
        dp[chan1] = FROM_S32(d1);

        dp += chan2;
      }

      /* last pixels */
      /* (odd wid) computed with the full 25-tap sum in one go */
      for (; i < wid; i++) {
        p00 = buff0[i]; p10 = buff1[i]; p20 = buff2[i]; p30 = buff3[i];
        p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
        p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
        p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
        p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];

        p40 = buff4[i]; p41 = buff4[i + 1]; p42 = buff4[i + 2];
        p43 = buff4[i + 3]; p44 = buff4[i + 4];

        buff5[i + dx_l] = (FTYPE)sp[0];

        buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
                       p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
                       p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
                       p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
                       p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);

        dp[0] = FROM_S32(buffo[i]);

        sp += chan1;
        dp += chan1;
      }

      /* load whatever is left of the next source row (no output here) */
      for (; i < swid; i++) {
        buff5[i + dx_l] = (FTYPE)sp[0];
        sp += chan1;
      }

      /* pad the freshly loaded row on both sides */
      for (i = 0; i < dx_l; i++) buff5[i] = buff5[dx_l];
      for (i = 0; i < dx_r; i++) buff5[swid + dx_l + i] = buff5[swid + dx_l - 1];

      /* next line */

      /* stop advancing at the last available source row (bottom padding) */
      if (j < hgt - dy_b - 2) sl += sll;
      dl += dll;

      /* rotate the line buffers: the oldest row becomes the load target */
      buffT = buff0;
      buff0 = buff1;
      buff1 = buff2;
      buff2 = buff3;
      buff3 = buff4;
      buff4 = buff5;
      buff5 = buffT;
    }
  }

  /* release the line buffer if it came from mlib_malloc */
  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
1092
1093/***************************************************************/
1094#ifndef __sparc /* for x86, using integer multiplies is faster */
1095
1096mlib_status CONV_FUNC_I(5x5)
1097{
1098 mlib_s32 buff[BUFF_LINE];
1099 mlib_s32 *buffd;
1100 mlib_s32 k[KSIZE*KSIZE];
1101 mlib_s32 shift1, shift2;
1102 mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
1103 mlib_s32 p00, p01, p02, p03, p04, p05,
1104 p10, p11, p12, p13, p14, p15;
1105 DTYPE *adr_src, *sl, *sp0, *sp1, *sp2, *sp3, *sp4;
1106 DTYPE *sp_1, *sp_2, *sp_3, *sp_4;
1107 DTYPE *adr_dst, *dl, *dp;
1108 mlib_s32 *pbuff = buff;
1109 mlib_s32 wid, hgt, sll, dll;
1110 mlib_s32 nchannel, chan1, chan2, chan4;
1111 mlib_s32 delta_chan1, delta_chan2, delta_chan3;
1112 mlib_s32 i, j, c;
1113
1114#if IMG_TYPE != 1
1115 shift1 = 16;
1116#else
1117 shift1 = 8;
1118#endif /* IMG_TYPE != 1 */
1119
1120 shift2 = scalef_expon - shift1;
1121
1122 for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
1123
1124 GET_SRC_DST_PARAMETERS(DTYPE);
1125
1126 if (wid > BUFF_LINE) {
1127 pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
1128
1129 if (pbuff == NULL) return MLIB_FAILURE;
1130 }
1131
1132 buffd = pbuff;
1133
1134 chan1 = nchannel;
1135 chan2 = chan1 + chan1;
1136
1137 if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan1 = chan1;
1138 else delta_chan1 = 0;
1139
1140 if ((2 > dx_l) && (2 < wid + KSIZE1 - dx_r)) delta_chan2 = delta_chan1 + chan1;
1141 else delta_chan2 = delta_chan1;
1142
1143 if ((3 > dx_l) && (3 < wid + KSIZE1 - dx_r)) delta_chan3 = delta_chan2 + chan1;
1144 else delta_chan3 = delta_chan2;
1145
1146 chan4 = chan1 + delta_chan3;
1147
1148 for (c = 0; c < chan1; c++) {
1149 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1150
1151 sl = adr_src + c;
1152 dl = adr_dst + c;
1153
1154 sp_1 = sl;
1155
1156 if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
1157 sp_2 = sl;
1158
1159 if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl += sll;
1160 sp_3 = sl;
1161
1162 if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl += sll;
1163 sp_4 = sl;
1164
1165 if ((hgt - dy_b) > 0) sl += sll;
1166
1167 for (j = 0; j < hgt; j++) {
1168 mlib_s32 pix0, pix1;
1169
1170 dp = dl;
1171 sp0 = sp_1;
1172 sp_1 = sp_2;
1173 sp_2 = sp_3;
1174 sp_3 = sp_4;
1175 sp_4 = sl;
1176
1177 sp1 = sp_1;
1178 sp2 = sp_2;
1179 sp3 = sp_3;
1180 sp4 = sp_4;
1181
1182 /*
1183 * First loop
1184 */
1185
1186 k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
1187 k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
1188
1189 p02 = sp0[0]; p12 = sp1[0];
1190 p03 = sp0[delta_chan1]; p13 = sp1[delta_chan1];
1191 p04 = sp0[delta_chan2]; p14 = sp1[delta_chan2];
1192 p05 = sp0[delta_chan3]; p15 = sp1[delta_chan3];
1193
1194 sp0 += chan4;
1195 sp1 += chan4;
1196
1197#ifdef __SUNPRO_C
1198#pragma pipeloop(0)
1199#endif /* __SUNPRO_C */
1200 for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1201 p00 = p02; p10 = p12;
1202 p01 = p03; p11 = p13;
1203 p02 = p04; p12 = p14;
1204 p03 = p05; p13 = p15;
1205
1206 p04 = sp0[0]; p14 = sp1[0];
1207 p05 = sp0[chan1]; p15 = sp1[chan1];
1208
1209 buffd[i ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1210 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1211 buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1212 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1213
1214 sp0 += chan2;
1215 sp1 += chan2;
1216 }
1217
1218 p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1219 p11 = p12; p12 = p13; p13 = p14; p14 = p15;
1220
1221 for (; i < wid - dx_r; i++) {
1222 p00 = p01; p10 = p11;
1223 p01 = p02; p11 = p12;
1224 p02 = p03; p12 = p13;
1225 p03 = p04; p13 = p14;
1226
1227 p04 = sp0[0]; p14 = sp1[0];
1228
1229 buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1230 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1231
1232 sp0 += chan1;
1233 sp1 += chan1;
1234 }
1235
1236 sp0 -= chan1;
1237 sp1 -= chan1;
1238
1239 for (; i < wid; i++) {
1240 p00 = p01; p10 = p11;
1241 p01 = p02; p11 = p12;
1242 p02 = p03; p12 = p13;
1243 p03 = p04; p13 = p14;
1244
1245 p04 = sp0[0]; p14 = sp1[0];
1246
1247 buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1248 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1249 }
1250
1251 /*
1252 * Second loop
1253 */
1254
1255 k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
1256 k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
1257
1258 p02 = sp2[0]; p12 = sp3[0];
1259 p03 = sp2[delta_chan1]; p13 = sp3[delta_chan1];
1260 p04 = sp2[delta_chan2]; p14 = sp3[delta_chan2];
1261 p05 = sp2[delta_chan3]; p15 = sp3[delta_chan3];
1262
1263 sp2 += chan4;
1264 sp3 += chan4;
1265
1266#ifdef __SUNPRO_C
1267#pragma pipeloop(0)
1268#endif /* __SUNPRO_C */
1269 for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1270 p00 = p02; p10 = p12;
1271 p01 = p03; p11 = p13;
1272 p02 = p04; p12 = p14;
1273 p03 = p05; p13 = p15;
1274
1275 p04 = sp2[0]; p14 = sp3[0];
1276 p05 = sp2[chan1]; p15 = sp3[chan1];
1277
1278 buffd[i ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1279 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1280 buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
1281 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
1282
1283 sp2 += chan2;
1284 sp3 += chan2;
1285 }
1286
1287 p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1288 p11 = p12; p12 = p13; p13 = p14; p14 = p15;
1289
1290 for (; i < wid - dx_r; i++) {
1291 p00 = p01; p10 = p11;
1292 p01 = p02; p11 = p12;
1293 p02 = p03; p12 = p13;
1294 p03 = p04; p13 = p14;
1295
1296 p04 = sp2[0]; p14 = sp3[0];
1297
1298 buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1299 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1300
1301 sp2 += chan1;
1302 sp3 += chan1;
1303 }
1304
1305 sp2 -= chan1;
1306 sp3 -= chan1;
1307
1308 for (; i < wid; i++) {
1309 p00 = p01; p10 = p11;
1310 p01 = p02; p11 = p12;
1311 p02 = p03; p12 = p13;
1312 p03 = p04; p13 = p14;
1313
1314 p04 = sp2[0]; p14 = sp3[0];
1315
1316 buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
1317 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
1318 }
1319
1320 /*
1321 * 3 loop
1322 */
1323
1324 k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
1325
1326 p02 = sp4[0];
1327 p03 = sp4[delta_chan1];
1328 p04 = sp4[delta_chan2];
1329 p05 = sp4[delta_chan3];
1330
1331 sp4 += chan4;
1332
1333#ifdef __SUNPRO_C
1334#pragma pipeloop(0)
1335#endif /* __SUNPRO_C */
1336 for (i = 0; i <= (wid - dx_r - 2); i += 2) {
1337 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
1338
1339 p04 = sp4[0]; p05 = sp4[chan1];
1340
1341 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1342 p03 * k3 + p04 * k4) >> shift2;
1343 pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
1344 p04 * k3 + p05 * k4) >> shift2;
1345
1346 CLAMP_STORE(dp[0], pix0)
1347 CLAMP_STORE(dp[chan1], pix1)
1348
1349 dp += chan2;
1350 sp4 += chan2;
1351 }
1352
1353 p01 = p02; p02 = p03; p03 = p04; p04 = p05;
1354
1355 for (; i < wid - dx_r; i++) {
1356 p00 = p01; p01 = p02; p02 = p03; p03 = p04;
1357
1358 p04 = sp4[0];
1359
1360 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1361 p03 * k3 + p04 * k4) >> shift2;
1362 CLAMP_STORE(dp[0], pix0)
1363
1364 dp += chan1;
1365 sp4 += chan1;
1366 }
1367
1368 sp4 -= chan1;
1369
1370 for (; i < wid; i++) {
1371 p00 = p01; p01 = p02; p02 = p03; p03 = p04;
1372
1373 p04 = sp4[0];
1374
1375 pix0 = (buffd[i ] + p00 * k0 + p01 * k1 + p02 * k2 +
1376 p03 * k3 + p04 * k4) >> shift2;
1377 CLAMP_STORE(dp[0], pix0)
1378
1379 dp += chan1;
1380 }
1381
1382 /* next line */
1383
1384 if (j < hgt - dy_b - 1) sl += sll;
1385 dl += dll;
1386 }
1387 }
1388
1389 if (pbuff != buff) mlib_free(pbuff);
1390
1391 return MLIB_SUCCESS;
1392}
1393
1394#endif /* __sparc ( for x86, using integer multiplies is faster ) */
1395
1396/***************************************************************/
#if IMG_TYPE == 1

#undef  KSIZE
#define KSIZE 7

/*
 * 7x7 convolution for the MLIB_EDGE_SRC_EXTEND edge mode, floating-point
 * accumulation path (compiled only when IMG_TYPE == 1).
 *
 * The parameter list comes from the PARAM macro and several locals
 * (sl, sl1, sp, dl, dp, pbuff, buffo, buffi, wid, hgt, swid, ...) come from
 * DEF_VARS; both are defined elsewhere in this file.  Local names must not
 * be changed because LOAD_KERNEL, GET_SRC_DST_PARAMETERS and LOAD_BUFF
 * refer to them.
 *
 * Working storage, in units of the padded row width swid = wid + KSIZE1:
 *   buffs[0 .. KSIZE]  KSIZE + 1 source rows widened to FTYPE.  The pointer
 *                      table is stored twice so that buffs + buff_ind always
 *                      addresses KSIZE + 1 consecutive ring entries.
 *   buffd              partial sums for the output row being built.
 *   buffo / buffi      32-bit integer staging area filled via LOAD_BUFF
 *                      (assumes FTYPE is twice as wide as mlib_s32 -- TODO
 *                      confirm against the FTYPE definition).
 * This adds up to (KSIZE + 3)*swid FTYPE elements.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE if the heap buffer cannot be
 * allocated.
 */
mlib_status CONV_FUNC(7x7)
{
  FTYPE    buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
  FTYPE    k[KSIZE*KSIZE];
  mlib_s32 l, m, buff_ind;
  mlib_s32 d0, d1;
  FTYPE    k0, k1, k2, k3, k4, k5, k6;
  FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
  DTYPE    *sl2, *sl3, *sl4, *sl5, *sl6;
  DEF_VARS(DTYPE);
  LOAD_KERNEL(KSIZE*KSIZE);
  GET_SRC_DST_PARAMETERS(DTYPE);

  swid = wid + KSIZE1;

  /*
   * Every buffer below is laid out with a stride of swid (not wid), so both
   * the "does it fit on the stack" test and the heap size must use swid.
   * Sizing by wid left the area (KSIZE + 3)*KSIZE1 elements short.
   */
  if (swid > BUFF_LINE) {
    pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE)*swid);

    if (pbuff == NULL) return MLIB_FAILURE;
  }

  for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*swid;
  for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
  buffd = buffs[KSIZE] + swid;
  buffo = (mlib_s32*)(buffd + swid);
  buffi = buffo + (swid &~ 1);

  /* from here on swid is the number of real source pixels per row */
  swid -= (dx_l + dx_r);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  for (c = 0; c < nchannel; c++) {
    /* bit (nchannel - 1 - c) of cmask selects channel c */
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /*
     * Pick the source line for each of the first seven kernel rows.  A row
     * index that falls in the top (dy_t) or bottom (dy_b) extension reuses
     * the previous line pointer instead of stepping by sll.
     */
    if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
    else sl1 = sl;

    if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
    else sl2 = sl1;

    if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
    else sl3 = sl2;

    if ((4 > dy_t) && (4 < hgt + KSIZE1 - dy_b)) sl4 = sl3 + sll;
    else sl4 = sl3;

    if ((5 > dy_t) && (5 < hgt + KSIZE1 - dy_b)) sl5 = sl4 + sll;
    else sl5 = sl4;

    if ((hgt - dy_b) > 0) sl6 = sl5 + sll;
    else sl6 = sl5;

    /* left extension: replicate the first pixel of every row dx_l times */
    for (i = 0; i < dx_l; i++) {
      buffs[0][i] = (FTYPE)sl[0];
      buffs[1][i] = (FTYPE)sl1[0];
      buffs[2][i] = (FTYPE)sl2[0];
      buffs[3][i] = (FTYPE)sl3[0];
      buffs[4][i] = (FTYPE)sl4[0];
      buffs[5][i] = (FTYPE)sl5[0];
      buffs[6][i] = (FTYPE)sl6[0];
    }

    /* real source pixels of the selected channel */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < swid; i++) {
      buffs[0][i + dx_l] = (FTYPE)sl[i*chan1];
      buffs[1][i + dx_l] = (FTYPE)sl1[i*chan1];
      buffs[2][i + dx_l] = (FTYPE)sl2[i*chan1];
      buffs[3][i + dx_l] = (FTYPE)sl3[i*chan1];
      buffs[4][i + dx_l] = (FTYPE)sl4[i*chan1];
      buffs[5][i + dx_l] = (FTYPE)sl5[i*chan1];
      buffs[6][i + dx_l] = (FTYPE)sl6[i*chan1];
    }

    /* right extension: replicate the last real pixel dx_r times */
    for (i = 0; i < dx_r; i++) {
      buffs[0][swid + dx_l + i] = buffs[0][swid + dx_l - 1];
      buffs[1][swid + dx_l + i] = buffs[1][swid + dx_l - 1];
      buffs[2][swid + dx_l + i] = buffs[2][swid + dx_l - 1];
      buffs[3][swid + dx_l + i] = buffs[3][swid + dx_l - 1];
      buffs[4][swid + dx_l + i] = buffs[4][swid + dx_l - 1];
      buffs[5][swid + dx_l + i] = buffs[5][swid + dx_l - 1];
      buffs[6][swid + dx_l + i] = buffs[6][swid + dx_l - 1];
    }

    buff_ind = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0.0;

    /* sl now tracks the line that will be loaded into the spare ring slot */
    if ((hgt - dy_b) > 1) sl = sl6 + sll;
    else sl = sl6;

    for (j = 0; j < hgt; j++) {
      FTYPE    **buffc = buffs + buff_ind;   /* the KSIZE rows in use     */
      FTYPE    *buffn = buffc[KSIZE];        /* slot for the incoming row */
      FTYPE    *pk = k;

      for (l = 0; l < KSIZE; l++) {
        FTYPE    *buff = buffc[l];           /* shadows the outer array   */
        d64_2x32 dd;

        sp = sl;
        dp = dl;

        /* prime the sliding window of row samples */
        p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
        p5 = buff[3]; p6 = buff[4]; p7 = buff[5];

        k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
        k4 = *pk++; k5 = *pk++; k6 = *pk++;

        if (l < (KSIZE - 1)) {
          /* kernel rows 0..KSIZE-2: accumulate only, two pixels per pass */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (wid - 2); i += 2) {
            p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

            p6 = buff[i + 6]; p7 = buff[i + 7];

            buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
            buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
          }

        } else {
          /*
           * Last kernel row: finish the sums, store the results, clear the
           * accumulators and load the next source row into buffn on the fly.
           */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (i = 0; i <= (wid - 2); i += 2) {
            p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

            p6 = buff[i + 6]; p7 = buff[i + 7];

            /* LOAD_BUFF is defined elsewhere; presumably it stores the two
               source samples at sp into buffi[i], buffi[i + 1] -- confirm */
            LOAD_BUFF(buffi);

            dd.d64 = *(FTYPE   *)(buffi + i);
            buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
            buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

            dp[0    ] = FROM_S32(d0);
            dp[chan1] = FROM_S32(d1);

            buffd[i    ] = 0.0;
            buffd[i + 1] = 0.0;

            sp += chan2;
            dp += chan2;
          }
        }
      }

      /* last pixels: odd-width remainder, computed with a plain 7x7 sum */
      for (; i < wid; i++) {
        FTYPE    *pk = k, s = 0;
        mlib_s32 d0;

        for (l = 0; l < KSIZE; l++) {
          FTYPE    *buff = buffc[l] + i;

          for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
        }

        d0 = D2I(s);
        dp[0] = FROM_S32(d0);

        buffn[i + dx_l] = (FTYPE)sp[0];

        sp += chan1;
        dp += chan1;
      }

      /* finish loading the incoming row past the output width */
      for (; i < swid; i++) {
        buffn[i + dx_l] = (FTYPE)sp[0];
        sp += chan1;
      }

      /* replicate the edges of the incoming row */
      for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
      for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];

      /* next line */

      if (j < hgt - dy_b - 2) sl += sll;
      dl += dll;

      buff_ind++;

      if (buff_ind >= KSIZE + 1) buff_ind = 0;
    }
  }

  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}

#endif /* IMG_TYPE == 1 */
1607
1608/***************************************************************/
1609#define MAX_KER 7
1610#define MAX_N 15
1611#define BUFF_SIZE 1600
1612#define CACHE_SIZE (64*1024)
1613
1614static mlib_status mlib_ImageConv1xN_ext(mlib_image *dst,
1615 const mlib_image *src,
1616 const mlib_d64 *k,
1617 mlib_s32 n,
1618 mlib_s32 dy_t,
1619 mlib_s32 dy_b,
1620 mlib_s32 cmask)
1621{
1622 DTYPE *adr_src, *sl;
1623 DTYPE *adr_dst, *dl, *dp;
1624 FTYPE buff[BUFF_SIZE];
1625 FTYPE *buffd;
1626 FTYPE *pbuff = buff;
1627 const FTYPE *pk;
1628 FTYPE k0, k1, k2, k3;
1629 FTYPE p0, p1, p2, p3, p4;
1630 FTYPE *sbuff;
1631 mlib_s32 l, k_off, off, bsize;
1632 mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
1633 mlib_s32 d0, d1, ii;
1634 mlib_s32 wid, hgt, sll, dll;
1635 mlib_s32 nchannel;
1636 mlib_s32 i, j, c;
1637 GET_SRC_DST_PARAMETERS(DTYPE);
1638
1639 max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
1640
1641 if (max_hsize < 1) max_hsize = 1;
1642 if (max_hsize > hgt) max_hsize = hgt;
1643
1644 shgt = hgt + (n - 1);
1645 smax_hsize = max_hsize + (n - 1);
1646
1647 bsize = 2 * (smax_hsize + 1);
1648
1649 if (bsize > BUFF_SIZE) {
1650 pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
1651
1652 if (pbuff == NULL) return MLIB_FAILURE;
1653 }
1654
1655 sbuff = pbuff;
1656 buffd = sbuff + smax_hsize;
1657
1658 shgt -= (dy_t + dy_b);
1659 k_off = 0;
1660
1661 for (l = 0; l < hgt; l += hsize) {
1662 hsize = hgt - l;
1663
1664 if (hsize > max_hsize) hsize = max_hsize;
1665
1666 smax_hsize = hsize + (n - 1);
1667
1668 for (c = 0; c < nchannel; c++) {
1669 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
1670
1671 sl = adr_src + c;
1672 dl = adr_dst + c;
1673
1674#ifdef __SUNPRO_C
1675#pragma pipeloop(0)
1676#endif /* __SUNPRO_C */
1677 for (i = 0; i < hsize; i++) buffd[i] = 0.0;
1678
1679 for (j = 0; j < wid; j++) {
1680 FTYPE *buff = sbuff;
1681
1682 for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
1683 sbuff[i - k_off] = (FTYPE)sl[0];
1684 }
1685
1686#ifdef __SUNPRO_C
1687#pragma pipeloop(0)
1688#endif /* __SUNPRO_C */
1689 for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
1690 sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
1691 }
1692
1693 for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
1694 sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
1695 }
1696
1697 pk = k;
1698
1699 for (off = 0; off < (n - 4); off += 4) {
1700
1701 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1702 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1703
1704#ifdef __SUNPRO_C
1705#pragma pipeloop(0)
1706#endif /* __SUNPRO_C */
1707 for (i = 0; i < hsize; i += 2) {
1708 p0 = p2; p1 = p3; p2 = p4;
1709
1710 p3 = buff[i + 3]; p4 = buff[i + 4];
1711
1712 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1713 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1714 }
1715
1716 pk += 4;
1717 buff += 4;
1718 }
1719
1720 dp = dl;
1721 kh = n - off;
1722
1723 if (kh == 4) {
1724 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1725 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1726
1727#ifdef __SUNPRO_C
1728#pragma pipeloop(0)
1729#endif /* __SUNPRO_C */
1730 for (i = 0; i <= (hsize - 2); i += 2) {
1731 p0 = p2; p1 = p3; p2 = p4;
1732
1733 p3 = buff[i + 3]; p4 = buff[i + 4];
1734
1735 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
1736 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1737
1738 dp[0 ] = FROM_S32(d0);
1739 dp[dll] = FROM_S32(d1);
1740
1741 buffd[i ] = 0.0;
1742 buffd[i + 1] = 0.0;
1743
1744 dp += 2*dll;
1745 }
1746
1747 if (i < hsize) {
1748 p0 = p2; p1 = p3; p2 = p4;
1749 p3 = buff[i + 3];
1750 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
1751 dp[0] = FROM_S32(d0);
1752 buffd[i] = 0.0;
1753 }
1754
1755 } else if (kh == 3) {
1756
1757 p2 = buff[0]; p3 = buff[1];
1758 k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
1759
1760#ifdef __SUNPRO_C
1761#pragma pipeloop(0)
1762#endif /* __SUNPRO_C */
1763 for (i = 0; i <= (hsize - 2); i += 2) {
1764 p0 = p2; p1 = p3;
1765
1766 p2 = buff[i + 2]; p3 = buff[i + 3];
1767
1768 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
1769 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1770
1771 dp[0 ] = FROM_S32(d0);
1772 dp[dll] = FROM_S32(d1);
1773
1774 buffd[i ] = 0.0;
1775 buffd[i + 1] = 0.0;
1776
1777 dp += 2*dll;
1778 }
1779
1780 if (i < hsize) {
1781 p0 = p2; p1 = p3;
1782 p2 = buff[i + 2];
1783 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
1784 dp[0] = FROM_S32(d0);
1785
1786 buffd[i] = 0.0;
1787 }
1788
1789 } else if (kh == 2) {
1790
1791 p2 = buff[0];
1792 k0 = pk[0]; k1 = pk[1];
1793
1794#ifdef __SUNPRO_C
1795#pragma pipeloop(0)
1796#endif /* __SUNPRO_C */
1797 for (i = 0; i <= (hsize - 2); i += 2) {
1798 p0 = p2;
1799
1800 p1 = buff[i + 1]; p2 = buff[i + 2];
1801
1802 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]);
1803 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
1804
1805 dp[0 ] = FROM_S32(d0);
1806 dp[dll] = FROM_S32(d1);
1807
1808 buffd[i ] = 0.0;
1809 buffd[i + 1] = 0.0;
1810
1811 dp += 2*dll;
1812 }
1813
1814 if (i < hsize) {
1815 p0 = p2;
1816 p1 = buff[i + 1];
1817 d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
1818 dp[0] = FROM_S32(d0);
1819
1820 buffd[i] = 0.0;
1821 }
1822
1823 } else /* kh == 1 */{
1824
1825 k0 = pk[0];
1826
1827#ifdef __SUNPRO_C
1828#pragma pipeloop(0)
1829#endif /* __SUNPRO_C */
1830 for (i = 0; i <= (hsize - 2); i += 2) {
1831 p0 = buff[i]; p1 = buff[i + 1];
1832
1833 d0 = D2I(p0*k0 + buffd[i ]);
1834 d1 = D2I(p1*k0 + buffd[i + 1]);
1835
1836 dp[0 ] = FROM_S32(d0);
1837 dp[dll] = FROM_S32(d1);
1838
1839 buffd[i ] = 0.0;
1840 buffd[i + 1] = 0.0;
1841
1842 dp += 2*dll;
1843 }
1844
1845 if (i < hsize) {
1846 p0 = buff[i];
1847 d0 = D2I(p0*k0 + buffd[i]);
1848 dp[0] = FROM_S32(d0);
1849
1850 buffd[i] = 0.0;
1851 }
1852 }
1853
1854 /* next line */
1855 sl += nchannel;
1856 dl += nchannel;
1857 }
1858 }
1859
1860 k_off += max_hsize;
1861 adr_dst += max_hsize*dll;
1862 }
1863
1864 if (pbuff != buff) mlib_free(pbuff);
1865
1866 return MLIB_SUCCESS;
1867}
1868
1869/***************************************************************/
1870mlib_status CONV_FUNC_MxN
1871{
1872 DTYPE *adr_src, *sl, *sp;
1873 DTYPE *adr_dst, *dl, *dp;
1874 FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
1875 FTYPE **buffs = buffs_arr, *buffd;
1876 FTYPE akernel[256], *k = akernel, fscale = DSCALE;
1877 FTYPE *pbuff = buff;
1878 FTYPE k0, k1, k2, k3, k4, k5, k6;
1879 FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
1880 mlib_s32 *buffi;
1881 mlib_s32 mn, l, off, kw, bsize, buff_ind;
1882 mlib_s32 d0, d1;
1883 mlib_s32 wid, hgt, sll, dll;
1884 mlib_s32 nchannel, chan1, chan2;
1885 mlib_s32 i, j, c, swid;
1886 d64_2x32 dd;
1887 GET_SRC_DST_PARAMETERS(DTYPE);
1888
1889 if (scale > 30) {
1890 fscale *= 1.0/(1 << 30);
1891 scale -= 30;
1892 }
1893
1894 fscale /= (1 << scale);
1895
1896 mn = m*n;
1897
1898 if (mn > 256) {
1899 k = mlib_malloc(mn*sizeof(mlib_d64));
1900
1901 if (k == NULL) return MLIB_FAILURE;
1902 }
1903
1904 for (i = 0; i < mn; i++) {
1905 k[i] = kernel[i]*fscale;
1906 }
1907
1908 if (m == 1) return mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
1909
1910 swid = wid + (m - 1);
1911
1912 bsize = (n + 3)*swid;
1913
1914 if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
1915 pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
1916
1917 if (pbuff == NULL) return MLIB_FAILURE;
1918 buffs = (FTYPE **)(pbuff + bsize);
1919 }
1920
1921 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
1922 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
1923 buffd = buffs[n] + swid;
1924 buffi = (mlib_s32*)(buffd + swid);
1925
1926 chan1 = nchannel;
1927 chan2 = chan1 + chan1;
1928
1929 swid -= (dx_l + dx_r);
1930
1931 for (c = 0; c < nchannel; c++) {
1932 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
1933
1934 sl = adr_src + c;
1935 dl = adr_dst + c;
1936
1937 for (l = 0; l < n; l++) {
1938 FTYPE *buff = buffs[l];
1939
1940 for (i = 0; i < dx_l; i++) {
1941 buff[i] = (FTYPE)sl[0];
1942 }
1943
1944#ifdef __SUNPRO_C
1945#pragma pipeloop(0)
1946#endif /* __SUNPRO_C */
1947 for (i = 0; i < swid; i++) {
1948 buff[i + dx_l] = (FTYPE)sl[i*chan1];
1949 }
1950
1951 for (i = 0; i < dx_r; i++) {
1952 buff[swid + dx_l + i] = buff[swid + dx_l - 1];
1953 }
1954
1955 if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
1956 }
1957
1958 buff_ind = 0;
1959
1960#ifdef __SUNPRO_C
1961#pragma pipeloop(0)
1962#endif /* __SUNPRO_C */
1963 for (i = 0; i < wid; i++) buffd[i] = 0.0;
1964
1965 for (j = 0; j < hgt; j++) {
1966 FTYPE **buffc = buffs + buff_ind;
1967 FTYPE *buffn = buffc[n];
1968 FTYPE *pk = k;
1969
1970 for (l = 0; l < n; l++) {
1971 FTYPE *buff_l = buffc[l];
1972
1973 for (off = 0; off < m;) {
1974 FTYPE *buff = buff_l + off;
1975
1976 kw = m - off;
1977
1978 if (kw > 2*MAX_KER) kw = MAX_KER; else
1979 if (kw > MAX_KER) kw = kw/2;
1980 off += kw;
1981
1982 sp = sl;
1983 dp = dl;
1984
1985 if (kw == 7) {
1986
1987 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
1988 p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
1989
1990 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
1991 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
1992
1993 if (l < (n - 1) || off < m) {
1994#ifdef __SUNPRO_C
1995#pragma pipeloop(0)
1996#endif /* __SUNPRO_C */
1997 for (i = 0; i <= (wid - 2); i += 2) {
1998 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
1999
2000 p6 = buff[i + 6]; p7 = buff[i + 7];
2001
2002 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2003 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2004 }
2005
2006 } else {
2007#ifdef __SUNPRO_C
2008#pragma pipeloop(0)
2009#endif /* __SUNPRO_C */
2010 for (i = 0; i <= (wid - 2); i += 2) {
2011 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2012
2013 p6 = buff[i + 6]; p7 = buff[i + 7];
2014
2015 LOAD_BUFF(buffi);
2016
2017 dd.d64 = *(FTYPE *)(buffi + i);
2018 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0;
2019 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2020
2021 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
2022 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2023
2024 dp[0 ] = FROM_S32(d0);
2025 dp[chan1] = FROM_S32(d1);
2026
2027 buffd[i ] = 0.0;
2028 buffd[i + 1] = 0.0;
2029
2030 sp += chan2;
2031 dp += chan2;
2032 }
2033 }
2034
2035 } else if (kw == 6) {
2036
2037 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2038 p5 = buff[3]; p6 = buff[4];
2039
2040 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2041 k4 = pk[4]; k5 = pk[5];
2042
2043 if (l < (n - 1) || off < m) {
2044#ifdef __SUNPRO_C
2045#pragma pipeloop(0)
2046#endif /* __SUNPRO_C */
2047 for (i = 0; i <= (wid - 2); i += 2) {
2048 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2049
2050 p5 = buff[i + 5]; p6 = buff[i + 6];
2051
2052 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2053 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2054 }
2055
2056 } else {
2057#ifdef __SUNPRO_C
2058#pragma pipeloop(0)
2059#endif /* __SUNPRO_C */
2060 for (i = 0; i <= (wid - 2); i += 2) {
2061 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2062
2063 p5 = buff[i + 5]; p6 = buff[i + 6];
2064
2065 LOAD_BUFF(buffi);
2066
2067 dd.d64 = *(FTYPE *)(buffi + i);
2068 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0;
2069 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2070
2071 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
2072 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2073
2074 dp[0 ] = FROM_S32(d0);
2075 dp[chan1] = FROM_S32(d1);
2076
2077 buffd[i ] = 0.0;
2078 buffd[i + 1] = 0.0;
2079
2080 sp += chan2;
2081 dp += chan2;
2082 }
2083 }
2084
2085 } else if (kw == 5) {
2086
2087 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2088 p5 = buff[3];
2089
2090 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2091 k4 = pk[4];
2092
2093 if (l < (n - 1) || off < m) {
2094#ifdef __SUNPRO_C
2095#pragma pipeloop(0)
2096#endif /* __SUNPRO_C */
2097 for (i = 0; i <= (wid - 2); i += 2) {
2098 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2099
2100 p4 = buff[i + 4]; p5 = buff[i + 5];
2101
2102 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2103 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2104 }
2105
2106 } else {
2107#ifdef __SUNPRO_C
2108#pragma pipeloop(0)
2109#endif /* __SUNPRO_C */
2110 for (i = 0; i <= (wid - 2); i += 2) {
2111 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2112
2113 p4 = buff[i + 4]; p5 = buff[i + 5];
2114
2115 LOAD_BUFF(buffi);
2116
2117 dd.d64 = *(FTYPE *)(buffi + i);
2118 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0;
2119 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2120
2121 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
2122 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2123
2124 dp[0 ] = FROM_S32(d0);
2125 dp[chan1] = FROM_S32(d1);
2126
2127 buffd[i ] = 0.0;
2128 buffd[i + 1] = 0.0;
2129
2130 sp += chan2;
2131 dp += chan2;
2132 }
2133 }
2134
2135 } else if (kw == 4) {
2136
2137 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2138
2139 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2140
2141 if (l < (n - 1) || off < m) {
2142#ifdef __SUNPRO_C
2143#pragma pipeloop(0)
2144#endif /* __SUNPRO_C */
2145 for (i = 0; i <= (wid - 2); i += 2) {
2146 p0 = p2; p1 = p3; p2 = p4;
2147
2148 p3 = buff[i + 3]; p4 = buff[i + 4];
2149
2150 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2151 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2152 }
2153
2154 } else {
2155#ifdef __SUNPRO_C
2156#pragma pipeloop(0)
2157#endif /* __SUNPRO_C */
2158 for (i = 0; i <= (wid - 2); i += 2) {
2159 p0 = p2; p1 = p3; p2 = p4;
2160
2161 p3 = buff[i + 3]; p4 = buff[i + 4];
2162
2163 LOAD_BUFF(buffi);
2164
2165 dd.d64 = *(FTYPE *)(buffi + i);
2166 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0;
2167 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2168
2169 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
2170 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2171
2172 dp[0 ] = FROM_S32(d0);
2173 dp[chan1] = FROM_S32(d1);
2174
2175 buffd[i ] = 0.0;
2176 buffd[i + 1] = 0.0;
2177
2178 sp += chan2;
2179 dp += chan2;
2180 }
2181 }
2182
2183 } else if (kw == 3) {
2184
2185 p2 = buff[0]; p3 = buff[1];
2186 k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
2187
2188 if (l < (n - 1) || off < m) {
2189#ifdef __SUNPRO_C
2190#pragma pipeloop(0)
2191#endif /* __SUNPRO_C */
2192 for (i = 0; i <= (wid - 2); i += 2) {
2193 p0 = p2; p1 = p3;
2194
2195 p2 = buff[i + 2]; p3 = buff[i + 3];
2196
2197 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
2198 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2199 }
2200
2201 } else {
2202#ifdef __SUNPRO_C
2203#pragma pipeloop(0)
2204#endif /* __SUNPRO_C */
2205 for (i = 0; i <= (wid - 2); i += 2) {
2206 p0 = p2; p1 = p3;
2207
2208 p2 = buff[i + 2]; p3 = buff[i + 3];
2209
2210 LOAD_BUFF(buffi);
2211
2212 dd.d64 = *(FTYPE *)(buffi + i);
2213 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0;
2214 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2215
2216 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
2217 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2218
2219 dp[0 ] = FROM_S32(d0);
2220 dp[chan1] = FROM_S32(d1);
2221
2222 buffd[i ] = 0.0;
2223 buffd[i + 1] = 0.0;
2224
2225 sp += chan2;
2226 dp += chan2;
2227 }
2228 }
2229
2230 } else /* if (kw == 2) */ {
2231
2232 p2 = buff[0];
2233 k0 = pk[0]; k1 = pk[1];
2234
2235 if (l < (n - 1) || off < m) {
2236#ifdef __SUNPRO_C
2237#pragma pipeloop(0)
2238#endif /* __SUNPRO_C */
2239 for (i = 0; i <= (wid - 2); i += 2) {
2240 p0 = p2;
2241
2242 p1 = buff[i + 1]; p2 = buff[i + 2];
2243
2244 buffd[i ] += p0*k0 + p1*k1;
2245 buffd[i + 1] += p1*k0 + p2*k1;
2246 }
2247
2248 } else {
2249#ifdef __SUNPRO_C
2250#pragma pipeloop(0)
2251#endif /* __SUNPRO_C */
2252 for (i = 0; i <= (wid - 2); i += 2) {
2253 p0 = p2;
2254
2255 p1 = buff[i + 1]; p2 = buff[i + 2];
2256
2257 LOAD_BUFF(buffi);
2258
2259 dd.d64 = *(FTYPE *)(buffi + i);
2260 buffn[i + dx_l ] = (FTYPE)dd.i32s.i0;
2261 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
2262
2263 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]);
2264 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
2265
2266 dp[0 ] = FROM_S32(d0);
2267 dp[chan1] = FROM_S32(d1);
2268
2269 buffd[i ] = 0.0;
2270 buffd[i + 1] = 0.0;
2271
2272 sp += chan2;
2273 dp += chan2;
2274 }
2275 }
2276 }
2277
2278 pk += kw;
2279 }
2280 }
2281
2282 /* last pixels */
2283 for (; i < wid; i++) {
2284 FTYPE *pk = k, s = 0;
2285 mlib_s32 x, d0;
2286
2287 for (l = 0; l < n; l++) {
2288 FTYPE *buff = buffc[l] + i;
2289
2290 for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2291 }
2292
2293 d0 = D2I(s);
2294 dp[0] = FROM_S32(d0);
2295
2296 buffn[i + dx_l] = (FTYPE)sp[0];
2297
2298 sp += chan1;
2299 dp += chan1;
2300 }
2301
2302 for (; i < swid; i++) {
2303 buffn[i + dx_l] = (FTYPE)sp[0];
2304 sp += chan1;
2305 }
2306
2307 for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
2308 for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
2309
2310 /* next line */
2311
2312 if (j < hgt - dy_b - 2) sl += sll;
2313 dl += dll;
2314
2315 buff_ind++;
2316
2317 if (buff_ind >= n + 1) buff_ind = 0;
2318 }
2319 }
2320
2321 if (pbuff != buff) mlib_free(pbuff);
2322
2323 return MLIB_SUCCESS;
2324}
2325
2326/***************************************************************/
2327#ifndef __sparc /* for x86, using integer multiplies is faster */
2328
/*
 * Shift the accumulated sum x right by shift2 (x itself is modified) and
 * store it into res through CLAMP_STORE.  shift2 must be in scope at the
 * point of use.  The body is wrapped in do { } while (0) so the macro
 * expands to exactly one statement and stays correct under an unbraced
 * if/else; invoke it with a trailing semicolon.
 */
#define STORE_RES(res, x)                                       \
  do {                                                          \
    (x) >>= shift2;                                             \
    CLAMP_STORE(res, x);                                        \
  } while (0)
2332
2333mlib_status CONV_FUNC_MxN_I
2334{
2335 DTYPE *adr_src, *sl, *sp;
2336 DTYPE *adr_dst, *dl, *dp;
2337 mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
2338 mlib_s32 *pbuff = buff;
2339 mlib_s32 **buffs = buffs_arr, *buffd;
2340 mlib_s32 l, off, kw, bsize, buff_ind;
2341 mlib_s32 d0, d1, shift1, shift2;
2342 mlib_s32 k0, k1, k2, k3, k4, k5, k6;
2343 mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
2344 mlib_s32 wid, hgt, sll, dll;
2345 mlib_s32 nchannel, chan1;
2346 mlib_s32 i, j, c, swid;
2347 mlib_s32 chan2;
2348 mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
2349 GET_SRC_DST_PARAMETERS(DTYPE);
2350
2351#if IMG_TYPE != 1
2352 shift1 = 16;
2353#else
2354 shift1 = 8;
2355#endif /* IMG_TYPE != 1 */
2356 shift2 = scale - shift1;
2357
2358 chan1 = nchannel;
2359 chan2 = chan1 + chan1;
2360
2361 swid = wid + (m - 1);
2362
2363 bsize = (n + 2)*swid;
2364
2365 if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
2366 pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
2367
2368 if (pbuff == NULL) return MLIB_FAILURE;
2369 buffs = (mlib_s32 **)(pbuff + bsize);
2370 }
2371
2372 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
2373 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
2374 buffd = buffs[n] + swid;
2375
2376 if (m*n > MAX_N*MAX_N) {
2377 k = mlib_malloc(sizeof(mlib_s32)*(m*n));
2378
2379 if (k == NULL) {
2380 if (pbuff != buff) mlib_free(pbuff);
2381 return MLIB_FAILURE;
2382 }
2383 }
2384
2385 for (i = 0; i < m*n; i++) {
2386 k[i] = kernel[i] >> shift1;
2387 }
2388
2389 swid -= (dx_l + dx_r);
2390
2391 for (c = 0; c < nchannel; c++) {
2392 if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
2393
2394 sl = adr_src + c;
2395 dl = adr_dst + c;
2396
2397 for (l = 0; l < n; l++) {
2398 mlib_s32 *buff = buffs[l];
2399
2400 for (i = 0; i < dx_l; i++) {
2401 buff[i] = (mlib_s32)sl[0];
2402 }
2403
2404#ifdef __SUNPRO_C
2405#pragma pipeloop(0)
2406#endif /* __SUNPRO_C */
2407 for (i = 0; i < swid; i++) {
2408 buff[i + dx_l] = (mlib_s32)sl[i*chan1];
2409 }
2410
2411 for (i = 0; i < dx_r; i++) {
2412 buff[swid + dx_l + i] = buff[swid + dx_l - 1];
2413 }
2414
2415 if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
2416 }
2417
2418 buff_ind = 0;
2419
2420#ifdef __SUNPRO_C
2421#pragma pipeloop(0)
2422#endif /* __SUNPRO_C */
2423 for (i = 0; i < wid; i++) buffd[i] = 0;
2424
2425 for (j = 0; j < hgt; j++) {
2426 mlib_s32 **buffc = buffs + buff_ind;
2427 mlib_s32 *buffn = buffc[n];
2428 mlib_s32 *pk = k;
2429
2430 for (l = 0; l < n; l++) {
2431 mlib_s32 *buff_l = buffc[l];
2432
2433 for (off = 0; off < m;) {
2434 mlib_s32 *buff = buff_l + off;
2435
2436 sp = sl;
2437 dp = dl;
2438
2439 kw = m - off;
2440
2441 if (kw > 2*MAX_KER) kw = MAX_KER; else
2442 if (kw > MAX_KER) kw = kw/2;
2443 off += kw;
2444
2445 if (kw == 7) {
2446
2447 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2448 p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
2449
2450 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2451 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
2452
2453 if (l < (n - 1) || off < m) {
2454#ifdef __SUNPRO_C
2455#pragma pipeloop(0)
2456#endif /* __SUNPRO_C */
2457 for (i = 0; i <= (wid - 2); i += 2) {
2458 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2459
2460 p6 = buff[i + 6]; p7 = buff[i + 7];
2461
2462 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
2463 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
2464 }
2465
2466 } else {
2467#ifdef __SUNPRO_C
2468#pragma pipeloop(0)
2469#endif /* __SUNPRO_C */
2470 for (i = 0; i <= (wid - 2); i += 2) {
2471 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
2472
2473 p6 = buff[i + 6]; p7 = buff[i + 7];
2474
2475 buffn[i + dx_l ] = (mlib_s32)sp[0];
2476 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2477
2478 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);
2479 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
2480
2481 STORE_RES(dp[0 ], d0);
2482 STORE_RES(dp[chan1], d1);
2483
2484 buffd[i ] = 0;
2485 buffd[i + 1] = 0;
2486
2487 sp += chan2;
2488 dp += chan2;
2489 }
2490 }
2491
2492 } else if (kw == 6) {
2493
2494 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2495 p5 = buff[3]; p6 = buff[4];
2496
2497 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2498 k4 = pk[4]; k5 = pk[5];
2499
2500 if (l < (n - 1) || off < m) {
2501#ifdef __SUNPRO_C
2502#pragma pipeloop(0)
2503#endif /* __SUNPRO_C */
2504 for (i = 0; i <= (wid - 2); i += 2) {
2505 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2506
2507 p5 = buff[i + 5]; p6 = buff[i + 6];
2508
2509 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
2510 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
2511 }
2512
2513 } else {
2514#ifdef __SUNPRO_C
2515#pragma pipeloop(0)
2516#endif /* __SUNPRO_C */
2517 for (i = 0; i <= (wid - 2); i += 2) {
2518 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
2519
2520 p5 = buff[i + 5]; p6 = buff[i + 6];
2521
2522 buffn[i + dx_l ] = (mlib_s32)sp[0];
2523 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2524
2525 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);
2526 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
2527
2528 STORE_RES(dp[0 ], d0);
2529 STORE_RES(dp[chan1], d1);
2530
2531 buffd[i ] = 0;
2532 buffd[i + 1] = 0;
2533
2534 sp += chan2;
2535 dp += chan2;
2536 }
2537 }
2538
2539 } else if (kw == 5) {
2540
2541 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2542 p5 = buff[3];
2543
2544 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2545 k4 = pk[4];
2546
2547 if (l < (n - 1) || off < m) {
2548#ifdef __SUNPRO_C
2549#pragma pipeloop(0)
2550#endif /* __SUNPRO_C */
2551 for (i = 0; i <= (wid - 2); i += 2) {
2552 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2553
2554 p4 = buff[i + 4]; p5 = buff[i + 5];
2555
2556 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
2557 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
2558 }
2559
2560 } else {
2561#ifdef __SUNPRO_C
2562#pragma pipeloop(0)
2563#endif /* __SUNPRO_C */
2564 for (i = 0; i <= (wid - 2); i += 2) {
2565 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
2566
2567 p4 = buff[i + 4]; p5 = buff[i + 5];
2568
2569 buffn[i + dx_l ] = (mlib_s32)sp[0];
2570 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2571
2572 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);
2573 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
2574
2575 STORE_RES(dp[0 ], d0);
2576 STORE_RES(dp[chan1], d1);
2577
2578 buffd[i ] = 0;
2579 buffd[i + 1] = 0;
2580
2581 sp += chan2;
2582 dp += chan2;
2583 }
2584 }
2585
2586 } else if (kw == 4) {
2587
2588 p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
2589
2590 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
2591
2592 if (l < (n - 1) || off < m) {
2593#ifdef __SUNPRO_C
2594#pragma pipeloop(0)
2595#endif /* __SUNPRO_C */
2596 for (i = 0; i <= (wid - 2); i += 2) {
2597 p0 = p2; p1 = p3; p2 = p4;
2598
2599 p3 = buff[i + 3]; p4 = buff[i + 4];
2600
2601 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
2602 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
2603 }
2604
2605 } else {
2606#ifdef __SUNPRO_C
2607#pragma pipeloop(0)
2608#endif /* __SUNPRO_C */
2609 for (i = 0; i <= (wid - 2); i += 2) {
2610 p0 = p2; p1 = p3; p2 = p4;
2611
2612 p3 = buff[i + 3]; p4 = buff[i + 4];
2613
2614 buffn[i + dx_l ] = (mlib_s32)sp[0];
2615 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2616
2617 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);
2618 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
2619
2620 STORE_RES(dp[0 ], d0);
2621 STORE_RES(dp[chan1], d1);
2622
2623 buffd[i ] = 0;
2624 buffd[i + 1] = 0;
2625
2626 sp += chan2;
2627 dp += chan2;
2628 }
2629 }
2630
2631 } else if (kw == 3) {
2632
2633 p2 = buff[0]; p3 = buff[1];
2634 k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
2635
2636 if (l < (n - 1) || off < m) {
2637#ifdef __SUNPRO_C
2638#pragma pipeloop(0)
2639#endif /* __SUNPRO_C */
2640 for (i = 0; i <= (wid - 2); i += 2) {
2641 p0 = p2; p1 = p3;
2642
2643 p2 = buff[i + 2]; p3 = buff[i + 3];
2644
2645 buffd[i ] += p0*k0 + p1*k1 + p2*k2;
2646 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
2647 }
2648
2649 } else {
2650#ifdef __SUNPRO_C
2651#pragma pipeloop(0)
2652#endif /* __SUNPRO_C */
2653 for (i = 0; i <= (wid - 2); i += 2) {
2654 p0 = p2; p1 = p3;
2655
2656 p2 = buff[i + 2]; p3 = buff[i + 3];
2657
2658 buffn[i + dx_l ] = (mlib_s32)sp[0];
2659 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2660
2661 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]);
2662 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
2663
2664 STORE_RES(dp[0 ], d0);
2665 STORE_RES(dp[chan1], d1);
2666
2667 buffd[i ] = 0;
2668 buffd[i + 1] = 0;
2669
2670 sp += chan2;
2671 dp += chan2;
2672 }
2673 }
2674
2675 } else if (kw == 2) {
2676
2677 p2 = buff[0];
2678 k0 = pk[0]; k1 = pk[1];
2679
2680 if (l < (n - 1) || off < m) {
2681#ifdef __SUNPRO_C
2682#pragma pipeloop(0)
2683#endif /* __SUNPRO_C */
2684 for (i = 0; i <= (wid - 2); i += 2) {
2685 p0 = p2;
2686
2687 p1 = buff[i + 1]; p2 = buff[i + 2];
2688
2689 buffd[i ] += p0*k0 + p1*k1;
2690 buffd[i + 1] += p1*k0 + p2*k1;
2691 }
2692
2693 } else {
2694#ifdef __SUNPRO_C
2695#pragma pipeloop(0)
2696#endif /* __SUNPRO_C */
2697 for (i = 0; i <= (wid - 2); i += 2) {
2698 p0 = p2;
2699
2700 p1 = buff[i + 1]; p2 = buff[i + 2];
2701
2702 buffn[i + dx_l ] = (mlib_s32)sp[0];
2703 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2704
2705 d0 = (p0*k0 + p1*k1 + buffd[i ]);
2706 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
2707
2708 STORE_RES(dp[0 ], d0);
2709 STORE_RES(dp[chan1], d1);
2710
2711 buffd[i ] = 0;
2712 buffd[i + 1] = 0;
2713
2714 sp += chan2;
2715 dp += chan2;
2716 }
2717 }
2718
2719 } else /* kw == 1 */{
2720
2721 k0 = pk[0];
2722
2723 if (l < (n - 1) || off < m) {
2724#ifdef __SUNPRO_C
2725#pragma pipeloop(0)
2726#endif /* __SUNPRO_C */
2727 for (i = 0; i <= (wid - 2); i += 2) {
2728 p0 = buff[i]; p1 = buff[i + 1];
2729
2730 buffd[i ] += p0*k0;
2731 buffd[i + 1] += p1*k0;
2732 }
2733
2734 } else {
2735#ifdef __SUNPRO_C
2736#pragma pipeloop(0)
2737#endif /* __SUNPRO_C */
2738 for (i = 0; i <= (wid - 2); i += 2) {
2739 p0 = buff[i]; p1 = buff[i + 1];
2740
2741 buffn[i + dx_l ] = (mlib_s32)sp[0];
2742 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
2743
2744 d0 = (p0*k0 + buffd[i ]);
2745 d1 = (p1*k0 + buffd[i + 1]);
2746
2747 STORE_RES(dp[0 ], d0);
2748 STORE_RES(dp[chan1], d1);
2749
2750 buffd[i ] = 0;
2751 buffd[i + 1] = 0;
2752
2753 sp += chan2;
2754 dp += chan2;
2755 }
2756 }
2757 }
2758
2759 pk += kw;
2760 }
2761 }
2762
2763 /* last pixels */
2764 for (; i < wid; i++) {
2765 mlib_s32 *pk = k, x, s = 0;
2766
2767 for (l = 0; l < n; l++) {
2768 mlib_s32 *buff = buffc[l] + i;
2769
2770 for (x = 0; x < m; x++) s += buff[x] * (*pk++);
2771 }
2772
2773 STORE_RES(dp[0], s);
2774
2775 buffn[i + dx_l] = (mlib_s32)sp[0];
2776
2777 sp += chan1;
2778 dp += chan1;
2779 }
2780
2781 for (; i < swid; i++) {
2782 buffn[i + dx_l] = (mlib_s32)sp[0];
2783 sp += chan1;
2784 }
2785
2786 for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
2787 for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
2788
2789 /* next line */
2790
2791 if (j < hgt - dy_b - 2) sl += sll;
2792 dl += dll;
2793
2794 buff_ind++;
2795
2796 if (buff_ind >= n + 1) buff_ind = 0;
2797 }
2798 }
2799
2800 if (pbuff != buff) mlib_free(pbuff);
2801 if (k != k_locl) mlib_free(k);
2802
2803 return MLIB_SUCCESS;
2804}
2805
2806#endif /* __sparc ( for x86, using integer multiplies is faster ) */
2807
2808/***************************************************************/