blob: 3b0c388e7dac463c37ae039d4cbc3f71263010cc [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 2003 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26
27/*
28 * FUNCTION
29 * Internal functions for mlib_ImageConv* on D64/F32 type and
30 * MLIB_EDGE_DST_NO_WRITE mask
31 *
32 */
33
34#include "mlib_image.h"
35#include "mlib_ImageConv.h"
36
37/***************************************************************/
38/*
39 This define switches between functions of MLIB_DOUBLE and MLIB_FLOAT types:
40 Files mlib_ImageConv_D64nw.c and mlib_ImageConv_F32nw.c
41*/
42
/*
 * Element-type selector for this translation unit.  While TYPE_DOUBLE is
 * defined, the block below picks the mlib_d64 variants; otherwise the
 * mlib_f32 variants are selected.
 */
43#define TYPE_DOUBLE
44
45/***************************************************************/
46#ifdef TYPE_DOUBLE
47
/*
 * CONV_FUNC(KERN) token-pastes the kernel tag into the exported function
 * name, e.g. CONV_FUNC(3x3) -> mlib_conv3x3nw_d64.
 */
48#define CONV_FUNC(KERN) mlib_conv##KERN##nw_d64
49
/* DTYPE is the pixel (and in-register coefficient) type used by every routine. */
50#define DTYPE mlib_d64
51
52#else
53
/* Single-precision counterpart: CONV_FUNC(3x3) -> mlib_conv3x3nw_f32. */
54#define CONV_FUNC(KERN) mlib_conv##KERN##nw_f32
55
56#define DTYPE mlib_f32
57
58#endif /* TYPE_DOUBLE */
59
60/***************************************************************/
/*
 * Declares and initializes the per-image locals shared by every routine in
 * this file.  Identifiers named `src` and `dst` must be in scope at the
 * expansion site.
 *
 *   hgt, wid          height / width taken from src
 *   sll, dll          src / dst stride divided by sizeof(type)
 *                     (presumably the stride is in bytes, making these
 *                     element counts -- confirm against mlib_image.h)
 *   adr_src, adr_dst  data pointers of src / dst, typed as `type *`
 *   chan1             channel count of src
 *
 * The expansion ends without a semicolon; the user of the macro supplies it.
 */
61#define GET_SRC_DST_PARAMETERS(type) \
62 mlib_s32 hgt = mlib_ImageGetHeight(src); \
63 mlib_s32 wid = mlib_ImageGetWidth(src); \
64 mlib_s32 sll = mlib_ImageGetStride(src) / sizeof(type); \
65 mlib_s32 dll = mlib_ImageGetStride(dst) / sizeof(type); \
66 type* adr_src = mlib_ImageGetData(src); \
67 type* adr_dst = mlib_ImageGetData(dst); \
68 mlib_s32 chan1 = mlib_ImageGetChannels(src)
69
70/***************************************************************/
/*
 * Declares the common locals of a convolution routine: everything from
 * GET_SRC_DST_PARAMETERS(type), plus
 *
 *   sl       current source line pointer
 *   dl, dp   current destination line pointer / destination pixel pointer
 *   i, j, c  loop counters (used below as column, row and channel indices)
 *
 * Like GET_SRC_DST_PARAMETERS, the expansion has no trailing semicolon.
 */
71#define DEF_VARS(type) \
72 GET_SRC_DST_PARAMETERS(type); \
73 type *sl; \
74 type *dl, *dp; \
75 mlib_s32 i, j, c
76
77/***************************************************************/
78#undef KSIZE
79#define KSIZE 2
80
/*
 * 2x2 convolution; the destination edge is left unwritten.
 *
 * With TYPE_DOUBLE defined the name expands to mlib_conv2x2nw_d64.
 *
 *   dst    destination image; (wid - 1) x (hgt - 1) pixels are written,
 *          starting at its data pointer (no offset is applied for 2x2)
 *   src    source image
 *   kern   4 coefficients; kern[0], kern[1] weight the current source row,
 *          kern[2], kern[3] the row below it
 *   cmask  channel mask; channel c is processed only when bit
 *          (chan1 - 1 - c) is set
 *
 * Each output is
 *   src(y, x)*k0 + src(y, x+1)*k1 + src(y+1, x)*k2 + src(y+1, x+1)*k3.
 *
 * Always returns MLIB_SUCCESS.
 */
81mlib_status CONV_FUNC(2x2)(mlib_image *dst,
82 const mlib_image *src,
83 const mlib_d64 *kern,
84 mlib_s32 cmask)
85{
86 DEF_VARS(DTYPE);
87 DTYPE *sp0, *sp1;
88 mlib_s32 chan2 = chan1 + chan1;
89 mlib_s32 chan3 = chan1 + chan2;
90 mlib_s32 chan4 = chan3 + chan1;
91 DTYPE k0, k1, k2, k3;
92 DTYPE p00, p01, p02, p03, p04,
93 p10, p11, p12, p13, p14;
94
95 /* keep kernel in regs */
96 k0 = (DTYPE)kern[0]; k1 = (DTYPE)kern[1];
97 k2 = (DTYPE)kern[2]; k3 = (DTYPE)kern[3];
98
/* number of output columns / rows: one fewer than the source in each direction */
99 wid -= (KSIZE - 1);
100 hgt -= (KSIZE - 1);
101
102 for (c = 0; c < chan1; c++) {
/* skip channels not selected by cmask */
103 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
104
105 dl = adr_dst + c;
106 sl = adr_src + c;
107
108 for (j = 0; j < hgt; j++) {
109 dp = dl;
110 sp0 = sl;
111 sp1 = sp0 + sll;
112
/* prime the pipeline: p04/p14 hold the left-most sample of both rows */
113 p04 = sp0[0];
114 p14 = sp1[0];
115
116 sp0 += chan1;
117 sp1 += chan1;
118
119#ifdef __SUNPRO_C
120#pragma pipeloop(0)
121#endif /* __SUNPRO_C */
/* main loop: four output pixels per iteration; the last sample loaded
   (p04/p14) is carried over as the first sample of the next iteration */
122 for (i = 0; i <= (wid - 4); i += 4) {
123 p00 = p04; p10 = p14;
124
125 p01 = sp0[0]; p11 = sp1[0];
126 p02 = sp0[chan1]; p12 = sp1[chan1];
127 p03 = sp0[chan2]; p13 = sp1[chan2];
128 p04 = sp0[chan3]; p14 = sp1[chan3];
129
130 dp[0 ] = p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3;
131 dp[chan1] = p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3;
132 dp[chan2] = p02 * k0 + p03 * k1 + p12 * k2 + p13 * k3;
133 dp[chan3] = p03 * k0 + p04 * k1 + p13 * k2 + p14 * k3;
134
135 dp += chan4;
136 sp0 += chan4;
137 sp1 += chan4;
138 }
139
/* remainder: up to three trailing output pixels, loading only the samples needed */
140 if (i < wid) {
141 p00 = p04; p10 = p14;
142 p01 = sp0[0]; p11 = sp1[0];
143 dp[0] = p00 * k0 + p01 * k1 + p10 * k2 + p11 * k3;
144
145 if ((i + 1) < wid) {
146 p02 = sp0[chan1]; p12 = sp1[chan1];
147 dp[chan1] = p01 * k0 + p02 * k1 + p11 * k2 + p12 * k3;
148
149 if ((i + 2) < wid) {
150 p03 = sp0[chan2]; p13 = sp1[chan2];
151 dp[chan2] = p02 * k0 + p03 * k1 + p12 * k2 + p13 * k3;
152 }
153 }
154 }
155
/* advance to the next source / destination line */
156 sl += sll;
157 dl += dll;
158 }
159 }
160
161 return MLIB_SUCCESS;
162}
163
164/***************************************************************/
165#undef KSIZE
166#define KSIZE 3
167
/*
 * 3x3 convolution; the destination edge is left unwritten.
 *
 * With TYPE_DOUBLE defined the name expands to mlib_conv3x3nw_d64.
 *
 *   dst    destination image; (wid - 2) x (hgt - 2) pixels are written,
 *          starting one line down and one pixel right of its data pointer
 *   src    source image
 *   kern   9 coefficients, three per source row (kern[0..2] for the top
 *          row, kern[3..5] for the middle row, kern[6..8] for the bottom row)
 *   cmask  channel mask; channel c is processed only when bit
 *          (chan1 - 1 - c) is set
 *
 * Always returns MLIB_SUCCESS.
 */
168mlib_status CONV_FUNC(3x3)(mlib_image *dst,
169 const mlib_image *src,
170 const mlib_d64 *kern,
171 mlib_s32 cmask)
172{
173 DEF_VARS(DTYPE);
174 mlib_s32 chan2 = chan1 + chan1;
175 DTYPE *sp0, *sp1;
176 DTYPE *sp2;
177 DTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8;
178 DTYPE p02, p03, p12, p13, p22, p23;
179
180 /* keep kernel in regs */
181 k0 = (DTYPE)kern[0]; k1 = (DTYPE)kern[1]; k2 = (DTYPE)kern[2];
182 k3 = (DTYPE)kern[3]; k4 = (DTYPE)kern[4]; k5 = (DTYPE)kern[5];
183 k6 = (DTYPE)kern[6]; k7 = (DTYPE)kern[7]; k8 = (DTYPE)kern[8];
184
/* number of output columns / rows */
185 wid -= (KSIZE - 1);
186 hgt -= (KSIZE - 1);
187
/* (KSIZE - 1)/2 == 1: shift the destination by one line and one pixel */
188 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
189
190 for (c = 0; c < chan1; c++) {
/* skip channels not selected by cmask */
191 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
192
193 sl = adr_src + c;
194 dl = adr_dst + c;
195
196 for (j = 0; j < hgt; j++) {
/* s0: partial sum of the next output from its first two kernel columns;
   s1: partial sum of the output after that from its first kernel column */
197 DTYPE s0, s1;
198
199 dp = dl;
200 sp0 = sl;
201 sp1 = sp0 + sll;
202 sp2 = sp1 + sll;
203
/* prime the pipeline with the first two columns of the three source rows */
204 p02 = sp0[0];
205 p12 = sp1[0];
206 p22 = sp2[0];
207
208 p03 = sp0[chan1];
209 p13 = sp1[chan1];
210 p23 = sp2[chan1];
211
212 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
213 s1 = p03 * k0 + p13 * k3 + p23 * k6;
214
215 sp0 += chan2;
216 sp1 += chan2;
217 sp2 += chan2;
218
219#ifdef __SUNPRO_C
220#pragma pipeloop(0)
221#endif /* __SUNPRO_C */
/* main loop: two output pixels per iteration; loads two new columns,
   completes both outputs from s0/s1, then rebuilds s0/s1 for the next pair */
222 for (i = 0; i <= (wid - 2); i += 2) {
223 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
224 p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
225
226 dp[0 ] = s0 + p02 * k2 + p12 * k5 + p22 * k8;
227 dp[chan1] = s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8;
228
229 s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
230 s1 = p03 * k0 + p13 * k3 + p23 * k6;
231
232 sp0 += chan2;
233 sp1 += chan2;
234 sp2 += chan2;
235 dp += chan2;
236 }
237
/* odd output width: finish the last pixel from s0 plus one more column */
238 if (wid & 1) {
239 p02 = sp0[0]; p12 = sp1[0]; p22 = sp2[0];
240 dp[0] = s0 + p02 * k2 + p12 * k5 + p22 * k8;
241 }
242
/* advance to the next source / destination line */
243 sl += sll;
244 dl += dll;
245 }
246 }
247
248 return MLIB_SUCCESS;
249}
250
251/***************************************************************/
252#undef KSIZE
253#define KSIZE 4
254
/*
 * 4x4 convolution; the destination edge is left unwritten.
 *
 * With TYPE_DOUBLE defined the name expands to mlib_conv4x4nw_d64.
 *
 *   dst    destination image; (wid - 3) x (hgt - 3) pixels are written,
 *          starting one line down and one pixel right of its data pointer
 *   src    source image
 *   k      16 coefficients, four per source row
 *   cmask  channel mask; channel c is processed only when bit
 *          (chan1 - 1 - c) is set
 *
 * Every output line is produced in two passes over the source: the first
 * pass applies kernel rows 0-1 and stores the result, the second applies
 * kernel rows 2-3 and adds to it.
 *
 * Always returns MLIB_SUCCESS.
 */
255mlib_status CONV_FUNC(4x4)(mlib_image *dst,
256 const mlib_image *src,
257 const mlib_d64 *k,
258 mlib_s32 cmask)
259{
260 DTYPE k0, k1, k2, k3, k4, k5, k6, k7;
261 DTYPE p00, p01, p02, p03, p04,
262 p10, p11, p12, p13, p14;
263 DEF_VARS(DTYPE);
264 DTYPE *sp0, *sp1;
265 mlib_s32 chan2 = chan1 + chan1;
266 mlib_s32 chan3 = chan1 + chan2;
267
/* number of output columns / rows */
268 wid -= (KSIZE - 1);
269 hgt -= (KSIZE - 1);
270
/* (KSIZE - 1)/2 == 1 in integer arithmetic: one line down, one pixel right */
271 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
272
273 for (c = 0; c < chan1; c++) {
/* skip channels not selected by cmask */
274 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
275
276 sl = adr_src + c;
277 dl = adr_dst + c;
278
279 for (j = 0; j < hgt; j++) {
280 /*
281 * First loop on two first lines of kernel
282 */
283 sp0 = sl;
284 sp1 = sp0 + sll;
285 dp = dl;
286
/* k0..k7 are reused by both passes, so they are reloaded on every line */
287 k0 = (DTYPE)k[0]; k1 = (DTYPE)k[1]; k2 = (DTYPE)k[2]; k3 = (DTYPE)k[3];
288 k4 = (DTYPE)k[4]; k5 = (DTYPE)k[5]; k6 = (DTYPE)k[6]; k7 = (DTYPE)k[7];
289
/* prime the pipeline with the first three columns of both rows */
290 p02 = sp0[0]; p12 = sp1[0];
291 p03 = sp0[chan1]; p13 = sp1[chan1];
292 p04 = sp0[chan2]; p14 = sp1[chan2];
293
294 sp0 += chan3;
295 sp1 += chan3;
296
297#ifdef __SUNPRO_C
298#pragma pipeloop(0)
299#endif /* __SUNPRO_C */
/* two output pixels per iteration: shift the sample window left by two,
   load two new columns, store (not accumulate) the partial result */
300 for (i = 0; i <= (wid - 2); i += 2) {
301 p00 = p02; p10 = p12;
302 p01 = p03; p11 = p13;
303 p02 = p04; p12 = p14;
304
305 p03 = sp0[0]; p13 = sp1[0];
306 p04 = sp0[chan1]; p14 = sp1[chan1];
307
308 dp[0 ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
309 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
310 dp[chan1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
311 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
312
313 sp0 += chan2;
314 sp1 += chan2;
315 dp += chan2;
316 }
317
/* odd output width: one trailing pixel */
318 if (wid & 1) {
319 p00 = p02; p10 = p12;
320 p01 = p03; p11 = p13;
321 p02 = p04; p12 = p14;
322 p03 = sp0[0]; p13 = sp1[0];
323
324 dp[0] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
325 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
326 }
327
328 /*
329 * Second loop on two last lines of kernel
330 */
331 sp0 = sl + 2*sll;
332 sp1 = sp0 + sll;
333 dp = dl;
334
335 k0 = (DTYPE)k[ 8]; k1 = (DTYPE)k[ 9]; k2 = (DTYPE)k[10]; k3 = (DTYPE)k[11];
336 k4 = (DTYPE)k[12]; k5 = (DTYPE)k[13]; k6 = (DTYPE)k[14]; k7 = (DTYPE)k[15];
337
338 p02 = sp0[0]; p12 = sp1[0];
339 p03 = sp0[chan1]; p13 = sp1[chan1];
340 p04 = sp0[chan2]; p14 = sp1[chan2];
341
342 sp0 += chan3;
343 sp1 += chan3;
344
345#ifdef __SUNPRO_C
346#pragma pipeloop(0)
347#endif /* __SUNPRO_C */
/* same window walk as the first pass, but accumulating into dst */
348 for (i = 0; i <= (wid - 2); i += 2) {
349 p00 = p02; p10 = p12;
350 p01 = p03; p11 = p13;
351 p02 = p04; p12 = p14;
352
353 p03 = sp0[0]; p13 = sp1[0];
354 p04 = sp0[chan1]; p14 = sp1[chan1];
355
356 dp[0 ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
357 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
358 dp[chan1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
359 p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
360
361 sp0 += chan2;
362 sp1 += chan2;
363 dp += chan2;
364 }
365
/* odd output width: one trailing pixel */
366 if (wid & 1) {
367 p00 = p02; p10 = p12;
368 p01 = p03; p11 = p13;
369 p02 = p04; p12 = p14;
370 p03 = sp0[0]; p13 = sp1[0];
371
372 dp[0] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
373 p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
374 }
375
376 /* next line */
377 sl += sll;
378 dl += dll;
379 }
380 }
381
382 return MLIB_SUCCESS;
383}
384
385/***************************************************************/
386#undef KSIZE
387#define KSIZE 5
388
/*
 * 5x5 convolution; the destination edge is left unwritten.
 *
 * With TYPE_DOUBLE defined the name expands to mlib_conv5x5nw_d64.
 *
 *   dst    destination image; (wid - 4) x (hgt - 4) pixels are written,
 *          starting two lines down and two pixels right of its data pointer
 *   src    source image
 *   k      25 coefficients, five per source row
 *   cmask  channel mask; channel c is processed only when bit
 *          (chan1 - 1 - c) is set
 *
 * Every output line is produced in three passes over the source: kernel
 * rows 0-1 (stored), kernel rows 2-3 (accumulated) and kernel row 4
 * (accumulated).
 *
 * Always returns MLIB_SUCCESS.
 */
389mlib_status CONV_FUNC(5x5)(mlib_image *dst,
390 const mlib_image *src,
391 const mlib_d64 *k,
392 mlib_s32 cmask)
393{
394 DTYPE k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
395 DTYPE p00, p01, p02, p03, p04, p05,
396 p10, p11, p12, p13, p14, p15;
397 DEF_VARS(DTYPE);
398 DTYPE *sp0, *sp1;
399 mlib_s32 chan2 = chan1 + chan1;
400 mlib_s32 chan3 = chan1 + chan2;
401 mlib_s32 chan4 = chan3 + chan1;
402
/* number of output columns / rows */
403 wid -= (KSIZE - 1);
404 hgt -= (KSIZE - 1);
405
/* (KSIZE - 1)/2 == 2: two lines down, two pixels right */
406 adr_dst += ((KSIZE - 1)/2)*(dll + chan1);
407
408 for (c = 0; c < chan1; c++) {
/* skip channels not selected by cmask */
409 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
410
411 sl = adr_src + c;
412 dl = adr_dst + c;
413
414 for (j = 0; j < hgt; j++) {
415 /*
416 * First loop
417 */
418 sp0 = sl;
419 sp1 = sp0 + sll;
420 dp = dl;
421
/* kernel rows 0 and 1; k0..k9 are reused by later passes, hence reloaded per line */
422 k0 = (DTYPE)k[0]; k1 = (DTYPE)k[1]; k2 = (DTYPE)k[2]; k3 = (DTYPE)k[3]; k4 = (DTYPE)k[4];
423 k5 = (DTYPE)k[5]; k6 = (DTYPE)k[6]; k7 = (DTYPE)k[7]; k8 = (DTYPE)k[8]; k9 = (DTYPE)k[9];
424
/* prime the pipeline with the first four columns of both rows */
425 p02 = sp0[0]; p12 = sp1[0];
426 p03 = sp0[chan1]; p13 = sp1[chan1];
427 p04 = sp0[chan2]; p14 = sp1[chan2];
428 p05 = sp0[chan3]; p15 = sp1[chan3];
429
430 sp0 += chan4;
431 sp1 += chan4;
432
433#ifdef __SUNPRO_C
434#pragma pipeloop(0)
435#endif /* __SUNPRO_C */
/* two output pixels per iteration: shift the sample window left by two,
   load two new columns, store (not accumulate) the partial result */
436 for (i = 0; i <= (wid - 2); i += 2) {
437 p00 = p02; p10 = p12;
438 p01 = p03; p11 = p13;
439 p02 = p04; p12 = p14;
440 p03 = p05; p13 = p15;
441
442 p04 = sp0[0]; p14 = sp1[0];
443 p05 = sp0[chan1]; p15 = sp1[chan1];
444
445 dp[ 0] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
446 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
447 dp[chan1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
448 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
449
450 sp0 += chan2;
451 sp1 += chan2;
452 dp += chan2;
453 }
454
/* odd output width: one trailing pixel */
455 if (wid & 1) {
456 p00 = p02; p10 = p12;
457 p01 = p03; p11 = p13;
458 p02 = p04; p12 = p14;
459 p03 = p05; p13 = p15;
460
461 p04 = sp0[0]; p14 = sp1[0];
462
463 dp[0] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
464 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
465 }
466
467 /*
468 * Second loop
469 */
470 sp0 = sl + 2*sll;
471 sp1 = sp0 + sll;
472 dp = dl;
473
/* kernel rows 2 and 3 */
474 k0 = (DTYPE)k[10]; k1 = (DTYPE)k[11]; k2 = (DTYPE)k[12]; k3 = (DTYPE)k[13]; k4 = (DTYPE)k[14];
475 k5 = (DTYPE)k[15]; k6 = (DTYPE)k[16]; k7 = (DTYPE)k[17]; k8 = (DTYPE)k[18]; k9 = (DTYPE)k[19];
476
477 p02 = sp0[0]; p12 = sp1[0];
478 p03 = sp0[chan1]; p13 = sp1[chan1];
479 p04 = sp0[chan2]; p14 = sp1[chan2];
480 p05 = sp0[chan3]; p15 = sp1[chan3];
481
482 sp0 += chan4;
483 sp1 += chan4;
484
485#ifdef __SUNPRO_C
486#pragma pipeloop(0)
487#endif /* __SUNPRO_C */
/* same window walk as the first pass, but accumulating into dst */
488 for (i = 0; i <= (wid - 2); i += 2) {
489 p00 = p02; p10 = p12;
490 p01 = p03; p11 = p13;
491 p02 = p04; p12 = p14;
492 p03 = p05; p13 = p15;
493
494 p04 = sp0[0]; p14 = sp1[0];
495 p05 = sp0[chan1]; p15 = sp1[chan1];
496
497 dp[ 0] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
498 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
499 dp[chan1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
500 p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
501
502 sp0 += chan2;
503 sp1 += chan2;
504 dp += chan2;
505 }
506
/* odd output width: one trailing pixel */
507 if (wid & 1) {
508 p00 = p02; p10 = p12;
509 p01 = p03; p11 = p13;
510 p02 = p04; p12 = p14;
511 p03 = p05; p13 = p15;
512
513 p04 = sp0[0]; p14 = sp1[0];
514
515 dp[0] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
516 p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
517 }
518
519 /*
520 * 3 loop
521 */
522 dp = dl;
523 sp0 = sl + 4*sll;
524
/* kernel row 4: a single source row, so only k0..k4 and sp0 are used */
525 k0 = (DTYPE)k[20]; k1 = (DTYPE)k[21]; k2 = (DTYPE)k[22]; k3 = (DTYPE)k[23]; k4 = (DTYPE)k[24];
526
527 p02 = sp0[0];
528 p03 = sp0[chan1];
529 p04 = sp0[chan2];
530 p05 = sp0[chan3];
531
/* chan2 + chan2 equals chan4: skip the four columns just loaded */
532 sp0 += chan2 + chan2;
533
534#ifdef __SUNPRO_C
535#pragma pipeloop(0)
536#endif /* __SUNPRO_C */
537 for (i = 0; i <= (wid - 2); i += 2) {
538 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
539
540 p04 = sp0[0]; p05 = sp0[chan1];
541
542 dp[0 ] += p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4;
543 dp[chan1] += p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4;
544
545 dp += chan2;
546 sp0 += chan2;
547 }
548
/* odd output width: one trailing pixel */
549 if (wid & 1) {
550 p00 = p02; p01 = p03; p02 = p04; p03 = p05;
551
552 p04 = sp0[0];
553
554 dp[0] += p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4;
555 }
556
557 /* next line */
558 sl += sll;
559 dl += dll;
560 }
561 }
562
563 return MLIB_SUCCESS;
564}
565
566/***************************************************************/
567#define BUFF_SIZE 1600
568
569#define CACHE_SIZE (64*1024)
570
571static mlib_status mlib_ImageConv1xN(mlib_image *dst,
572 const mlib_image *src,
573 const DTYPE *k,
574 mlib_s32 n,
575 mlib_s32 dn,
576 mlib_s32 cmask)
577{
578 DTYPE buff[BUFF_SIZE], *pbuff = buff;
579 const DTYPE *pk;
580 DTYPE k0, k1, k2, k3;
581 DTYPE p0, p1, p2, p3, p4;
582 DTYPE *sp, *sl_c, *dl_c, *sl0;
583 DEF_VARS(DTYPE);
584 mlib_s32 off, kh;
585 mlib_s32 l, hsize, max_hsize;
586
587 hgt -= (n - 1);
588 adr_dst += dn*dll;
589
590 max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
591
592 if (!max_hsize) max_hsize = 1;
593
594 if (max_hsize > BUFF_SIZE) {
595 pbuff = mlib_malloc(sizeof(DTYPE)*max_hsize);
596 }
597
598 sl_c = adr_src;
599 dl_c = adr_dst;
600
601 for (l = 0; l < hgt; l += hsize) {
602 hsize = hgt - l;
603
604 if (hsize > max_hsize) hsize = max_hsize;
605
606 for (c = 0; c < chan1; c++) {
607 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
608
609 sl = sl_c + c;
610 dl = dl_c + c;
611
612#ifdef __SUNPRO_C
613#pragma pipeloop(0)
614#endif /* __SUNPRO_C */
615 for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
616
617 for (i = 0; i < wid; i++) {
618 sl0 = sl;
619
620 for (off = 0; off < (n - 4); off += 4) {
621 pk = k + off;
622 sp = sl0;
623
624 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
625 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
626 sp += 3*sll;
627
628#ifdef __SUNPRO_C
629#pragma pipeloop(0)
630#endif /* __SUNPRO_C */
631 for (j = 0; j < hsize; j += 2) {
632 p0 = p2; p1 = p3; p2 = p4;
633 p3 = sp[0];
634 p4 = sp[sll];
635
636 pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
637 pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
638
639 sp += 2*sll;
640 }
641
642 sl0 += 4*sll;
643 }
644
645 pk = k + off;
646 sp = sl0;
647
648 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
649 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
650
651 dp = dl;
652 kh = n - off;
653
654 if (kh == 4) {
655 sp += 3*sll;
656
657#ifdef __SUNPRO_C
658#pragma pipeloop(0)
659#endif /* __SUNPRO_C */
660 for (j = 0; j <= (hsize - 2); j += 2) {
661 p0 = p2; p1 = p3; p2 = p4;
662 p3 = sp[0];
663 p4 = sp[sll];
664
665 dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j];
666 dp[dll] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1];
667
668 pbuff[j] = 0;
669 pbuff[j + 1] = 0;
670
671 sp += 2*sll;
672 dp += 2*dll;
673 }
674
675 if (j < hsize) {
676 p0 = p2; p1 = p3; p2 = p4;
677 p3 = sp[0];
678
679 dp[0] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j];
680
681 pbuff[j] = 0;
682 }
683
684 } else if (kh == 3) {
685 sp += 2*sll;
686
687#ifdef __SUNPRO_C
688#pragma pipeloop(0)
689#endif /* __SUNPRO_C */
690 for (j = 0; j <= (hsize - 2); j += 2) {
691 p0 = p2; p1 = p3;
692 p2 = sp[0];
693 p3 = sp[sll];
694
695 dp[0 ] = p0*k0 + p1*k1 + p2*k2 + pbuff[j];
696 dp[dll] = p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1];
697
698 pbuff[j] = 0;
699 pbuff[j + 1] = 0;
700
701 sp += 2*sll;
702 dp += 2*dll;
703 }
704
705 if (j < hsize) {
706 p0 = p2; p1 = p3;
707 p2 = sp[0];
708
709 dp[0] = p0*k0 + p1*k1 + p2*k2 + pbuff[j];
710
711 pbuff[j] = 0;
712 }
713
714 } else if (kh == 2) {
715 sp += sll;
716
717#ifdef __SUNPRO_C
718#pragma pipeloop(0)
719#endif /* __SUNPRO_C */
720 for (j = 0; j <= (hsize - 2); j += 2) {
721 p0 = p2;
722 p1 = sp[0];
723 p2 = sp[sll];
724
725 dp[0 ] = p0*k0 + p1*k1 + pbuff[j];
726 dp[dll] = p1*k0 + p2*k1 + pbuff[j + 1];
727
728 pbuff[j] = 0;
729 pbuff[j + 1] = 0;
730
731 sp += 2*sll;
732 dp += 2*dll;
733 }
734
735 if (j < hsize) {
736 p0 = p2;
737 p1 = sp[0];
738
739 dp[0] = p0*k0 + p1*k1 + pbuff[j];
740
741 pbuff[j] = 0;
742 }
743
744 } else /* if (kh == 1) */ {
745#ifdef __SUNPRO_C
746#pragma pipeloop(0)
747#endif /* __SUNPRO_C */
748 for (j = 0; j < hsize; j++) {
749 p0 = sp[0];
750
751 dp[0] = p0*k0 + pbuff[j];
752
753 pbuff[j] = 0;
754
755 sp += sll;
756 dp += dll;
757 }
758 }
759
760 sl += chan1;
761 dl += chan1;
762 }
763 }
764
765 sl_c += max_hsize*sll;
766 dl_c += max_hsize*dll;
767 }
768
769 if (pbuff != buff) mlib_free(pbuff);
770
771 return MLIB_SUCCESS;
772}
773
774/***************************************************************/
775#define MAX_KER 7
776#define MAX_NM 81
777
778mlib_status CONV_FUNC(MxN)(mlib_image *dst,
779 const mlib_image *src,
780 const mlib_d64 *ker,
781 mlib_s32 m,
782 mlib_s32 n,
783 mlib_s32 dm,
784 mlib_s32 dn,
785 mlib_s32 cmask)
786{
787 DTYPE k0, k1, k2, k3, k4, k5, k6, *sp;
788 DTYPE p0, p1, p2, p3, p4, p5, p6, p7;
789 mlib_s32 l, off, kw;
790 DEF_VARS(DTYPE);
791 mlib_s32 chan2 = chan1 + chan1;
792 mlib_s32 chan3 = chan1 + chan2;
793
794#ifdef TYPE_DOUBLE
795 const mlib_d64 *k = ker;
796#else
797 mlib_f32 k_arr[MAX_NM], *k = k_arr;
798
799 if (n*m > MAX_NM) {
800 k = mlib_malloc(n*m*sizeof(mlib_f32));
801
802 if (k == NULL) return MLIB_FAILURE;
803 }
804
805 for (i = 0; i < n*m; i++) k[i] = (mlib_f32)ker[i];
806#endif /* TYPE_DOUBLE */
807
808 if (m == 1) return mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
809
810 wid -= (m - 1);
811 hgt -= (n - 1);
812 adr_dst += dn*dll + dm*chan1;
813
814 for (c = 0; c < chan1; c++) {
815 if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
816
817 sl = adr_src + c;
818 dl = adr_dst + c;
819
820 for (j = 0; j < hgt; j++) {
821 const DTYPE *pk = k;
822
823 for (l = 0; l < n; l++) {
824 DTYPE *sp0 = sl + l*sll;
825
826 for (off = 0; off < m; off += kw, pk += kw, sp0 += chan1) {
827 kw = m - off;
828
829 if (kw > 2*MAX_KER) kw = MAX_KER; else
830 if (kw > MAX_KER) kw = kw/2;
831
832 p2 = sp0[0]; p3 = sp0[chan1]; p4 = sp0[chan2];
833 sp0 += chan3;
834 p5 = sp0[0]; p6 = sp0[chan1]; p7 = sp0[chan2];
835
836 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
837 k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
838
839 dp = dl;
840
841 if (kw == 7) {
842 sp = sp0 += chan3;
843
844 if (pk == k) {
845#ifdef __SUNPRO_C
846#pragma pipeloop(0)
847#endif /* __SUNPRO_C */
848 for (i = 0; i <= (wid - 2); i += 2) {
849 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
850
851 p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1];
852
853 dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
854 dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
855
856 sp += chan2;
857 dp += chan2;
858 }
859
860 } else {
861#ifdef __SUNPRO_C
862#pragma pipeloop(0)
863#endif /* __SUNPRO_C */
864 for (i = 0; i <= (wid - 2); i += 2) {
865 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
866
867 p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1];
868
869 dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
870 dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
871
872 sp += chan2;
873 dp += chan2;
874 }
875 }
876
877 } else if (kw == 6) {
878 sp = sp0 += chan2;
879
880 if (pk == k) {
881#ifdef __SUNPRO_C
882#pragma pipeloop(0)
883#endif /* __SUNPRO_C */
884 for (i = 0; i <= (wid - 2); i += 2) {
885 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
886
887 p5 = sp[0]; p6 = sp[chan1];
888
889 dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
890 dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
891
892 sp += chan2;
893 dp += chan2;
894 }
895
896 } else {
897#ifdef __SUNPRO_C
898#pragma pipeloop(0)
899#endif /* __SUNPRO_C */
900 for (i = 0; i <= (wid - 2); i += 2) {
901 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
902
903 p5 = sp[0]; p6 = sp[chan1];
904
905 dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
906 dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
907
908 sp += chan2;
909 dp += chan2;
910 }
911 }
912
913 } else if (kw == 5) {
914 sp = sp0 += chan1;
915
916 if (pk == k) {
917#ifdef __SUNPRO_C
918#pragma pipeloop(0)
919#endif /* __SUNPRO_C */
920 for (i = 0; i <= (wid - 2); i += 2) {
921 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
922
923 p4 = sp[0]; p5 = sp[chan1];
924
925 dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
926 dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
927
928 sp += chan2;
929 dp += chan2;
930 }
931
932 } else {
933#ifdef __SUNPRO_C
934#pragma pipeloop(0)
935#endif /* __SUNPRO_C */
936 for (i = 0; i <= (wid - 2); i += 2) {
937 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
938
939 p4 = sp[0]; p5 = sp[chan1];
940
941 dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
942 dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
943
944 sp += chan2;
945 dp += chan2;
946 }
947 }
948
949 } else if (kw == 4) {
950
951 sp = sp0;
952
953 if (pk == k) {
954#ifdef __SUNPRO_C
955#pragma pipeloop(0)
956#endif /* __SUNPRO_C */
957 for (i = 0; i <= (wid - 2); i += 2) {
958 p0 = p2; p1 = p3; p2 = p4;
959
960 p3 = sp[0]; p4 = sp[chan1];
961
962 dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3;
963 dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3;
964
965 sp += chan2;
966 dp += chan2;
967 }
968
969 } else {
970#ifdef __SUNPRO_C
971#pragma pipeloop(0)
972#endif /* __SUNPRO_C */
973 for (i = 0; i <= (wid - 2); i += 2) {
974 p0 = p2; p1 = p3; p2 = p4;
975
976 p3 = sp[0]; p4 = sp[chan1];
977
978 dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
979 dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
980
981 sp += chan2;
982 dp += chan2;
983 }
984 }
985
986 } else if (kw == 3) {
987 sp = sp0 -= chan1;
988
989 if (pk == k) {
990#ifdef __SUNPRO_C
991#pragma pipeloop(0)
992#endif /* __SUNPRO_C */
993 for (i = 0; i <= (wid - 2); i += 2) {
994 p0 = p2; p1 = p3;
995
996 p2 = sp[0]; p3 = sp[chan1];
997
998 dp[0 ] = p0*k0 + p1*k1 + p2*k2;
999 dp[chan1] = p1*k0 + p2*k1 + p3*k2;
1000
1001 sp += chan2;
1002 dp += chan2;
1003 }
1004
1005 } else {
1006#ifdef __SUNPRO_C
1007#pragma pipeloop(0)
1008#endif /* __SUNPRO_C */
1009 for (i = 0; i <= (wid - 2); i += 2) {
1010 p0 = p2; p1 = p3;
1011
1012 p2 = sp[0]; p3 = sp[chan1];
1013
1014 dp[0 ] += p0*k0 + p1*k1 + p2*k2;
1015 dp[chan1] += p1*k0 + p2*k1 + p3*k2;
1016
1017 sp += chan2;
1018 dp += chan2;
1019 }
1020 }
1021
1022 } else { /* kw == 2 */
1023 sp = sp0 -= chan2;
1024
1025 if (pk == k) {
1026#ifdef __SUNPRO_C
1027#pragma pipeloop(0)
1028#endif /* __SUNPRO_C */
1029 for (i = 0; i <= (wid - 2); i += 2) {
1030 p0 = p2;
1031
1032 p1 = sp[0]; p2 = sp[chan1];
1033
1034 dp[0 ] = p0*k0 + p1*k1;
1035 dp[chan1] = p1*k0 + p2*k1;
1036
1037 sp += chan2;
1038 dp += chan2;
1039 }
1040
1041 } else {
1042#ifdef __SUNPRO_C
1043#pragma pipeloop(0)
1044#endif /* __SUNPRO_C */
1045 for (i = 0; i <= (wid - 2); i += 2) {
1046 p0 = p2;
1047
1048 p1 = sp[0]; p2 = sp[chan1];
1049
1050 dp[0 ] += p0*k0 + p1*k1;
1051 dp[chan1] += p1*k0 + p2*k1;
1052
1053 sp += chan2;
1054 dp += chan2;
1055 }
1056 }
1057 }
1058 }
1059 }
1060
1061 /* last pixels */
1062
1063 if (wid & 1) {
1064 DTYPE *sp0 = sl + i*chan1, s = 0;
1065 const DTYPE *pk = k;
1066 mlib_s32 x;
1067
1068 for (l = 0; l < n; l++) {
1069 DTYPE *sp = sp0 + l*sll;
1070
1071 for (x = 0; x < m; x++) s += sp[x*chan1] * (*pk++);
1072 }
1073
1074 dp[0] = s;
1075 }
1076
1077 /* next line */
1078 sl += sll;
1079 dl += dll;
1080 }
1081 }
1082
1083#ifndef TYPE_DOUBLE
1084
1085 if (k != k_arr) mlib_free(k);
1086#endif /* TYPE_DOUBLE */
1087
1088 return MLIB_SUCCESS;
1089}
1090
1091/***************************************************************/