blob: 5235d388b3c8c22d6b053d43c938ccc2a76d78d2 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 1998-2003 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26
27
28/*
29 * The functions step along the lines from xLeft to xRight and apply
30 * the bicubic filtering.
31 *
32 */
33
34#include "vis_proto.h"
35#include "mlib_ImageAffine.h"
36#include "mlib_v_ImageFilters.h"
37
38/*#define MLIB_VIS2*/
39
40/***************************************************************/
/*
 * DTYPE is the sample type handled by this file (mlib_u8).
 * FILTER_BITS is not referenced in the visible part of this file;
 * presumably an included header uses it to derive FILTER_SHIFT and
 * FILTER_MASK -- TODO confirm.
 */
41#define DTYPE mlib_u8
42
43#define FILTER_BITS 8
44
45/***************************************************************/
/*
 * MLIB_WRITE_BMASK(bmask): when MLIB_VIS2 is defined this forwards to
 * vis_write_bmask(bmask, 0); otherwise it expands to nothing and the
 * argument is discarded.  The MLIB_VIS2 variants of the macros below
 * use vis_bshuffle(), which (per the VIS2 instruction set) selects bytes
 * according to the mask written here.
 */
46#ifdef MLIB_VIS2
47#define MLIB_WRITE_BMASK(bmask) vis_write_bmask(bmask, 0)
48#else
49#define MLIB_WRITE_BMASK(bmask)
50#endif /* MLIB_VIS2 */
51
52/***************************************************************/
/* Short alias: the macros below refer to srcPixelPtr as sPtr. */
53#define sPtr srcPixelPtr
54
55/***************************************************************/
/*
 * NEXT_PIXEL_1BC_U8(): derive the source position for the current
 * fixed-point coordinates (1 channel).
 *
 * Writes xSrc = (X >> MLIB_SHIFT) - 1 and ySrc = (Y >> MLIB_SHIFT) - 1,
 * then points sPtr at byte xSrc of source line lineAddr[ySrc].  This is
 * the first of the four rows that the LOAD/BC macros read.
 * X and Y are not modified.
 */
56#define NEXT_PIXEL_1BC_U8() \
57 xSrc = (X>>MLIB_SHIFT)-1; \
58 ySrc = (Y>>MLIB_SHIFT)-1; \
59 sPtr = (mlib_u8 *)lineAddr[ySrc] + xSrc
60
61/***************************************************************/
/*
 * ALIGN_ADDR(da, dp): prepare an unaligned 8-byte read starting at dp.
 *
 * Both variants call vis_alignaddr(dp, 0); per the VIS documentation this
 * records the low address bits in the GSR for later vis_faligndata()
 * calls.  `da` receives the address to load from:
 *   - without MLIB_VIS2: the value returned by vis_alignaddr();
 *   - with MLIB_VIS2:    dp with its three low bits cleared, computed
 *                        explicitly.
 *
 * NOTE: the MLIB_VIS2 variant expands to two statements, so the macro
 * must not be used as the sole body of an unbraced if/else/loop.
 */
62#ifndef MLIB_VIS2
63
64#define ALIGN_ADDR(da, dp) \
65 da = vis_alignaddr(dp, 0)
66
67#else
68
69#define ALIGN_ADDR(da, dp) \
70 vis_alignaddr(dp, 0); \
71 da = (mlib_d64*)(((mlib_addr)(dp)) &~ 7)
72
73#endif /* MLIB_VIS2 */
74
75/***************************************************************/
/*
 * LOAD_BC_U8_1CH_1PIXEL(mlib_filters_u8): fetch everything needed to
 * filter one 1-channel pixel (no arithmetic on the samples yet).
 *
 * Starting at sPtr, four source rows spaced srcYStride bytes apart are
 * read.  For each row two consecutive 8-byte words are loaded from the
 * aligned address and combined with vis_faligndata() into row00, row10,
 * row20 and row30.  sPtr is left pointing at the fourth row.
 *
 * The filter coefficients are then fetched for the current X/Y:
 * (coord >> FILTER_SHIFT) & FILTER_MASK is used as a BYTE offset into
 * the table and one 8-byte entry is read into yFilter and xFilter.
 * Finally X and Y are advanced by dX and dY.
 *
 * Writes: dpSrc, data0, data1, row00..row30, filterposx, filterposy,
 * xFilter, yFilter, sPtr, X, Y.
 */
76#define LOAD_BC_U8_1CH_1PIXEL(mlib_filters_u8) \
77 ALIGN_ADDR(dpSrc, sPtr); \
78 data0 = dpSrc[0]; \
79 data1 = dpSrc[1]; \
80 row00 = vis_faligndata(data0, data1); \
81 sPtr += srcYStride; \
82 ALIGN_ADDR(dpSrc, sPtr); \
83 data0 = dpSrc[0]; \
84 data1 = dpSrc[1]; \
85 row10 = vis_faligndata(data0, data1); \
86 sPtr += srcYStride; \
87 ALIGN_ADDR(dpSrc, sPtr); \
88 data0 = dpSrc[0]; \
89 data1 = dpSrc[1]; \
90 row20 = vis_faligndata(data0, data1); \
91 sPtr += srcYStride; \
92 ALIGN_ADDR(dpSrc, sPtr); \
93 data0 = dpSrc[0]; \
94 data1 = dpSrc[1]; \
95 row30 = vis_faligndata(data0, data1); \
96 filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK; \
97 yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
98 filterposx = (X >> FILTER_SHIFT) & FILTER_MASK; \
99 xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx)); \
100 X += dX; \
101 Y += dY
102
103/***************************************************************/
/*
 * SUM_4x16(v1, v3): horizontal add of the four 16-bit lanes of v3; the
 * result is written into the low 32 bits of v1 with vis_write_lo().
 *
 * Without MLIB_VIS2: the alignment offset is set to 2 bytes, v3 is
 * combined with itself through vis_faligndata() and added to the
 * original, then the high and low halves are added with vis_fpadd16s().
 * This overwrites v0 and v2 and changes the alignment offset set by
 * vis_alignaddr(), so a following vis_faligndata() needs a fresh
 * ALIGN_ADDR().
 *
 * With MLIB_VIS2: the halves are added first, duplicated into a register
 * pair, regrouped with vis_bshuffle() (which relies on the byte mask
 * written earlier via MLIB_WRITE_BMASK) and the halves are added again.
 * This overwrites v2 and the v3 argument itself.
 *
 * Both variants use the caller's v0/v2 variables by name rather than
 * through macro parameters.
 */
104#ifndef MLIB_VIS2
105
106#define SUM_4x16(v1, v3) \
107 vis_alignaddr((void*)2, 0); \
108 v0 = vis_faligndata(v3, v3); \
109 v2 = vis_fpadd16(v3, v0); \
110 v1 = vis_write_lo(v1, vis_fpadd16s(vis_read_hi(v2), vis_read_lo(v2)))
111
112#else
113
114#define SUM_4x16(v1, v3) \
115 v2 = vis_freg_pair(vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3)), \
116 vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3))); \
117 v3 = vis_bshuffle(v2, v2); \
118 v1 = vis_write_lo(v1, vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3)))
119
120#endif /* MLIB_VIS2 */
121
122/***************************************************************/
/*
 * RESULT_1BC_U8_1PIXEL(ind): compute one finished 1-channel pixel from
 * the rows row0<ind>..row3<ind> and the current yFilter/xFilter.
 *
 * Vertical pass: the high four bytes of each row are multiplied by one
 * of the four 16-bit coefficients held in yFilter (upper/lower half of
 * its high word, then upper/lower half of its low word) and the four
 * products are accumulated in `sum`.
 *
 * Horizontal pass: `sum` is multiplied lane-by-lane with xFilter using
 * the vis_fmul8sux16()/vis_fmul8ulx16() pair, the two partial products
 * are added, and the four lanes are reduced with SUM_4x16().
 *
 * The reduced value is packed with vis_fpack16() and placed in the low
 * word of `res`; call sites store it with vis_st_u8().
 * Overwrites v0..v3 and sum.
 */
123#define RESULT_1BC_U8_1PIXEL(ind) \
124 v0 = vis_fmul8x16au(vis_read_hi(row0##ind), vis_read_hi(yFilter)); \
125 v1 = vis_fmul8x16al(vis_read_hi(row1##ind), vis_read_hi(yFilter)); \
126 sum = vis_fpadd16(v0, v1); \
127 v2 = vis_fmul8x16au(vis_read_hi(row2##ind), vis_read_lo(yFilter)); \
128 sum = vis_fpadd16(sum, v2); \
129 v3 = vis_fmul8x16al(vis_read_hi(row3##ind), vis_read_lo(yFilter)); \
130 sum = vis_fpadd16(sum, v3); \
131 v0 = vis_fmul8sux16(sum, xFilter); \
132 v1 = vis_fmul8ulx16(sum, xFilter); \
133 v3 = vis_fpadd16(v1, v0); \
134 SUM_4x16(v1, v3); \
135 res = vis_write_lo(res, vis_fpack16(v1))
136
137/***************************************************************/
/*
 * BC_U8_1CH(index, ind1, ind2, mlib_filters_u8): one software-pipelined
 * step of the 1-channel kernel.  Three things are interleaved statement
 * by statement (the order is deliberate, do not regroup):
 *
 *   1. Arithmetic for the CURRENT pixel: rows row0<ind1>..row3<ind1> are
 *      weighted by yFilter, summed, multiplied by xFilter, and the four
 *      un-reduced 16-bit products are left in d<index>.  The final
 *      horizontal reduction is done later by FADD_1BC_U8().
 *   2. Loads for the NEXT pixel: four rows starting at sPtr are read into
 *      row0<ind2>..row3<ind2>, and the filter offsets are taken from the
 *      X/Y values on entry.  yFilter/xFilter are overwritten only after
 *      their last use for the current pixel.
 *   3. Address set-up for the pixel AFTER that: X and Y are advanced by
 *      dX/dY and sPtr is recomputed from the new coordinates.
 *
 * Call sites alternate (ind1, ind2) between (0, 1) and (1, 0) so that one
 * row set is consumed while the other is being filled.
 */
138#define BC_U8_1CH(index, ind1, ind2, mlib_filters_u8) \
139 ALIGN_ADDR(dpSrc, sPtr); \
140 data0 = dpSrc[0]; \
141 v0 = vis_fmul8x16au(vis_read_hi(row0##ind1), vis_read_hi(yFilter)); \
142 filterposy = (Y >> FILTER_SHIFT); \
143 data1 = dpSrc[1]; \
144 v1 = vis_fmul8x16al(vis_read_hi(row1##ind1), vis_read_hi(yFilter)); \
145 row0##ind2 = vis_faligndata(data0, data1); \
146 filterposx = (X >> FILTER_SHIFT); \
147 sPtr += srcYStride; \
148 ALIGN_ADDR(dpSrc, sPtr); \
149 sum = vis_fpadd16(v0, v1); \
150 data0 = dpSrc[0]; \
151 v2 = vis_fmul8x16au(vis_read_hi(row2##ind1), vis_read_lo(yFilter)); \
152 X += dX; \
153 data1 = dpSrc[1]; \
154 row1##ind2 = vis_faligndata(data0, data1); \
155 sPtr += srcYStride; \
156 ALIGN_ADDR(dpSrc, sPtr); \
157 Y += dY; \
158 sum = vis_fpadd16(sum, v2); \
159 xSrc = (X>>MLIB_SHIFT)-1; \
160 v3 = vis_fmul8x16al(vis_read_hi(row3##ind1), vis_read_lo(yFilter)); \
161 data0 = dpSrc[0]; \
162 ySrc = (Y>>MLIB_SHIFT)-1; \
163 sum = vis_fpadd16(sum, v3); \
164 data1 = dpSrc[1]; \
165 filterposy &= FILTER_MASK; \
166 v0 = vis_fmul8sux16(sum, xFilter); \
167 row2##ind2 = vis_faligndata(data0, data1); \
168 sPtr += srcYStride; \
169 v1 = vis_fmul8ulx16(sum, xFilter); \
170 filterposx &= FILTER_MASK; \
171 ALIGN_ADDR(dpSrc, sPtr); \
172 data0 = dpSrc[0]; \
173 d##index = vis_fpadd16(v0, v1); \
174 data1 = dpSrc[1]; \
175 row3##ind2 = vis_faligndata(data0, data1); \
176 yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
177 xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx)); \
178 sPtr = (mlib_u8 *)lineAddr[ySrc] + xSrc
179
180/***************************************************************/
/*
 * FADD_1BC_U8(): finish four pipelined 1-channel pixels at once.
 *
 * d0..d3 each hold the four un-reduced 16-bit products of one pixel (as
 * produced by BC_U8_1CH).  The macro adds the lanes of each d<k> and
 * leaves the four per-pixel sums in the four 16-bit lanes of `res`, in
 * pixel order, ready for vis_fpack16().
 *
 * Without MLIB_VIS2: halves are added with vis_fpadd16s(), then a chain
 * of vis_fpmerge() byte interleaves regroups the partial sums so that
 * one final vis_fpadd16() completes all four reductions.  Uses p0..p3,
 * m02, m13, m0213, e0, e1.
 *
 * With MLIB_VIS2: the same regrouping is done with vis_bshuffle(), which
 * relies on the byte mask written earlier via MLIB_WRITE_BMASK.
 * Overwrites v0..v3.
 */
181#ifndef MLIB_VIS2
182
183#define FADD_1BC_U8() \
184 p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0)); \
185 p1 = vis_fpadd16s(vis_read_hi(d1), vis_read_lo(d1)); \
186 p2 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2)); \
187 p3 = vis_fpadd16s(vis_read_hi(d3), vis_read_lo(d3)); \
188 m02 = vis_fpmerge(p0, p2); \
189 m13 = vis_fpmerge(p1, p3); \
190 m0213 = vis_fpmerge(vis_read_hi(m02), vis_read_hi(m13)); \
191 e0 = vis_fpmerge(vis_read_hi(m0213), vis_read_lo(m0213)); \
192 m0213 = vis_fpmerge(vis_read_lo(m02), vis_read_lo(m13)); \
193 e1 = vis_fpmerge(vis_read_hi(m0213), vis_read_lo(m0213)); \
194 res = vis_fpadd16(e0, e1)
195
196#else
197
198#define FADD_1BC_U8() \
199 v0 = vis_freg_pair(vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0)), \
200 vis_fpadd16s(vis_read_hi(d1), vis_read_lo(d1))); \
201 v1 = vis_freg_pair(vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2)), \
202 vis_fpadd16s(vis_read_hi(d3), vis_read_lo(d3))); \
203 v2 = vis_bshuffle(v0, v0); \
204 v3 = vis_bshuffle(v1, v1); \
205 res = vis_freg_pair(vis_fpadd16s(vis_read_hi(v2), vis_read_lo(v2)), \
206 vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3)))
207
208#endif /* MLIB_VIS2 */
209
210/***************************************************************/
/*
 * Bicubic affine transform of a 1-channel mlib_u8 image.
 *
 * param: affine parameter block; it is consumed by DECLAREVAR_BC() and
 *        CLIP(), which are defined outside this file.  Presumably they
 *        declare/initialise j, yStart, yFinish, xLeft, xRight, X, Y, dX,
 *        dY, lineAddr, srcYStride, dstPixelPtr, filter, xSrc, ySrc and
 *        srcPixelPtr -- TODO confirm against mlib_ImageAffine.h.
 *
 * Each destination row is processed in three phases:
 *   1. up to three single pixels until dstPixelPtr is 4-byte aligned;
 *   2. if at least 10 pixels remain, a software-pipelined section that
 *      produces four pixels per 32-bit store, followed by a fixed
 *      epilogue that drains the pipeline (10 pixels are accounted for
 *      by prologue + epilogue, hence the cols - 10 / cols - 14 bounds);
 *   3. a one-pixel-at-a-time loop for whatever is left.
 *
 * Returns MLIB_SUCCESS on every path visible here.
 */
211mlib_status mlib_ImageAffine_u8_1ch_bc (mlib_affine_param *param)
212{
213 DECLAREVAR_BC();
214 mlib_s32 filterposx, filterposy;
215 mlib_d64 data0, data1;
216 mlib_d64 sum;
217 mlib_d64 row00, row10, row20, row30;
218 mlib_d64 row01, row11, row21, row31;
219 mlib_d64 xFilter, yFilter;
220 mlib_d64 v0, v1, v2, v3;
221 mlib_d64 d0, d1, d2, d3;
222#ifndef MLIB_VIS2
223 mlib_f32 p0, p1, p2, p3;
224 mlib_d64 e0, e1;
225 mlib_d64 m02, m13, m0213;
226#endif /* MLIB_VIS2 */
227 mlib_d64 *dpSrc;
228 mlib_s32 align, cols, i;
229 mlib_d64 res;
230 const mlib_s16 *mlib_filters_table;
231
 /* Any filter value other than MLIB_BICUBIC selects the "bc2" table. */
232 if (filter == MLIB_BICUBIC) {
233 mlib_filters_table = mlib_filters_u8_bc;
234 } else {
235 mlib_filters_table = mlib_filters_u8_bc2;
236 }
237
238 for (j = yStart; j <= yFinish; j++) {
239
 /*
  * GSR is rewritten for every row.  Per the VIS documentation the
  * value 3 << 3 selects a pack scale factor of 3 for vis_fpack16().
  */
240 vis_write_gsr(3 << 3);
241 MLIB_WRITE_BMASK(0x0145ABEF);
242
243 CLIP(1);
244
245 cols = xRight - xLeft + 1;
 /*
  * '-' binds tighter than '&', so this is ((4 - addr) & 3) & 3: the
  * number of bytes (0..3) up to the next 4-byte boundary of
  * dstPixelPtr.  It is then capped at the row length.
  */
246 align = (4 - ((mlib_addr)dstPixelPtr) & 3) & 3;
247 align = (cols < align)? cols : align;
248
 /* Phase 1: single pixels until the destination is 4-byte aligned. */
249 for (i = 0; i < align; i++) {
250 NEXT_PIXEL_1BC_U8();
251 LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
252 RESULT_1BC_U8_1PIXEL(0);
253 vis_st_u8(res, dstPixelPtr++);
254 }
255
 /* Phase 2: pipelined section, needs at least 10 remaining pixels. */
256 if (i <= cols - 10) {
257
 /* Prologue: load pixel 0, then compute 8 pixels' worth of products. */
258 NEXT_PIXEL_1BC_U8();
259 LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
260
261 NEXT_PIXEL_1BC_U8();
262
263 BC_U8_1CH(0, 0, 1, mlib_filters_table);
264 BC_U8_1CH(1, 1, 0, mlib_filters_table);
265 BC_U8_1CH(2, 0, 1, mlib_filters_table);
266 BC_U8_1CH(3, 1, 0, mlib_filters_table);
267
268 FADD_1BC_U8();
269
270 BC_U8_1CH(0, 0, 1, mlib_filters_table);
271 BC_U8_1CH(1, 1, 0, mlib_filters_table);
272 BC_U8_1CH(2, 0, 1, mlib_filters_table);
273 BC_U8_1CH(3, 1, 0, mlib_filters_table);
274
 /* Steady state: store 4 finished pixels, reduce 4, start 4 more. */
275#pragma pipeloop(0)
276 for (; i <= cols - 14; i+=4) {
277 *(mlib_f32*)dstPixelPtr = vis_fpack16(res);
278 FADD_1BC_U8();
279 BC_U8_1CH(0, 0, 1, mlib_filters_table);
280 BC_U8_1CH(1, 1, 0, mlib_filters_table);
281 BC_U8_1CH(2, 0, 1, mlib_filters_table);
282 BC_U8_1CH(3, 1, 0, mlib_filters_table);
283 dstPixelPtr += 4;
284 }
285
 /* Epilogue: drain 4 + 4 pixels, then the two pixels still in flight. */
286 *(mlib_f32*)dstPixelPtr = vis_fpack16(res);
287 dstPixelPtr += 4;
288 FADD_1BC_U8();
289 *(mlib_f32*)dstPixelPtr = vis_fpack16(res);
290 dstPixelPtr += 4;
291
292 RESULT_1BC_U8_1PIXEL(0);
293 vis_st_u8(res, dstPixelPtr++);
294
295 LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
296 RESULT_1BC_U8_1PIXEL(0);
297 vis_st_u8(res, dstPixelPtr++);
298 i += 10;
299 }
300
 /* Phase 3: remaining pixels, one at a time. */
301 for (; i < cols; i++) {
302 NEXT_PIXEL_1BC_U8();
303 LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
304 RESULT_1BC_U8_1PIXEL(0);
305 vis_st_u8(res, dstPixelPtr++);
306 }
307 }
308
309 return MLIB_SUCCESS;
310}
311
312/***************************************************************/
/*
 * FADD_2BC_U8(): finish four pipelined 2-channel pixels at once.
 *
 * For each pixel k (0..3) the two partial-product vectors d0<k> and d1<k>
 * written by BC_U8_2CH(k, ...) are added, then the high and low halves
 * of the sum are added with vis_fpadd16s(), giving one 16-bit value per
 * channel in p<k>.  The four p<k> are paired up and packed with
 * vis_fpack16_pair() into `res` (8 bytes).
 * Overwrites d0..d3, p0..p3, e0, e1.
 */
313#define FADD_2BC_U8() \
314 d0 = vis_fpadd16(d00, d10); \
315 d1 = vis_fpadd16(d01, d11); \
316 d2 = vis_fpadd16(d02, d12); \
317 d3 = vis_fpadd16(d03, d13); \
318 p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0)); \
319 p1 = vis_fpadd16s(vis_read_hi(d1), vis_read_lo(d1)); \
320 p2 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2)); \
321 p3 = vis_fpadd16s(vis_read_hi(d3), vis_read_lo(d3)); \
322 e0 = vis_freg_pair(p0, p1); \
323 e1 = vis_freg_pair(p2, p3); \
324 res = vis_fpack16_pair(e0, e1)
325
326/***************************************************************/
/*
 * LOAD_BC_U8_2CH_1PIXEL(mlib_filters_u8): fetch everything needed to
 * filter one 2-channel pixel.
 *
 * First the filter entries for the current X/Y are read into yFilter and
 * xFilter ((coord >> FILTER_SHIFT) & FILTER_MASK is a BYTE offset into
 * the table) and X/Y are advanced by dX/dY.  Then four source rows,
 * srcYStride bytes apart and starting at sPtr, are read: two 8-byte
 * words per row, combined with vis_faligndata() into row0..row3.
 * sPtr is left pointing at the fourth row.
 */
327#define LOAD_BC_U8_2CH_1PIXEL(mlib_filters_u8) \
328 filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK; \
329 yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
330 filterposx = (X >> FILTER_SHIFT) & FILTER_MASK; \
331 xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx)); \
332 X += dX; \
333 Y += dY; \
334 ALIGN_ADDR(dpSrc, sPtr); \
335 data0 = dpSrc[0]; \
336 data1 = dpSrc[1]; \
337 row0 = vis_faligndata(data0, data1); \
338 sPtr += srcYStride; \
339 ALIGN_ADDR(dpSrc, sPtr); \
340 data0 = dpSrc[0]; \
341 data1 = dpSrc[1]; \
342 row1 = vis_faligndata(data0, data1); \
343 sPtr += srcYStride; \
344 ALIGN_ADDR(dpSrc, sPtr); \
345 data0 = dpSrc[0]; \
346 data1 = dpSrc[1]; \
347 row2 = vis_faligndata(data0, data1); \
348 sPtr += srcYStride; \
349 ALIGN_ADDR(dpSrc, sPtr); \
350 data0 = dpSrc[0]; \
351 data1 = dpSrc[1]; \
352 row3 = vis_faligndata(data0, data1)
353
354/***************************************************************/
/*
 * NEXT_PIXEL_2BC_U8(): same as the 1-channel variant, but the column is
 * scaled by 2 bytes per pixel: sPtr = lineAddr[ySrc] + 2 * xSrc, with
 * xSrc/ySrc = (X or Y >> MLIB_SHIFT) - 1.  X and Y are not modified.
 */
355#define NEXT_PIXEL_2BC_U8() \
356 xSrc = (X>>MLIB_SHIFT)-1; \
357 ySrc = (Y>>MLIB_SHIFT)-1; \
358 sPtr = (mlib_u8 *)lineAddr[ySrc] + (xSrc<<1)
359
360/***************************************************************/
/*
 * RESULT_2BC_U8_1PIXEL(): compute one finished 2-channel pixel from
 * row0..row3 and the current yFilter/xFilter.
 *
 * Vertical pass: the high and low halves of every row are multiplied by
 * the matching 16-bit y coefficient and accumulated in sum0 (high
 * halves) and sum1 (low halves).
 *
 * Coefficient expansion: the vis_fpmerge() chain through dr/dr1
 * rearranges the bytes of xFilter into xFilter0 and xFilter1.  Traced by
 * hand, each 16-bit x coefficient ends up duplicated -- (x0,x0,x1,x1)
 * and (x2,x2,x3,x3) -- which matches two interleaved channels.
 *
 * Horizontal pass: sum0/sum1 are multiplied by xFilter0/xFilter1 with
 * the vis_fmul8sux16()/vis_fmul8ulx16() pair, added together, and the
 * halves are folded with vis_fpadd16s().  The result is packed with
 * vis_fpack16() into the low word of `res`; call sites store the two
 * channel bytes with vis_st_u8().
 * Overwrites v00, v01, v10, v11, v20, v21, v30, v31, sum0, sum1, d0, dr,
 * dr1, xFilter0, xFilter1.
 */
361#define RESULT_2BC_U8_1PIXEL() \
362 v00 = vis_fmul8x16au(vis_read_hi(row0), vis_read_hi(yFilter)); \
363 dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter)); \
364 v01 = vis_fmul8x16au(vis_read_lo(row0), vis_read_hi(yFilter)); \
365 dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr)); \
366 v10 = vis_fmul8x16al(vis_read_hi(row1), vis_read_hi(yFilter)); \
367 dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr)); \
368 v11 = vis_fmul8x16al(vis_read_lo(row1), vis_read_hi(yFilter)); \
369 dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr)); \
370 v20 = vis_fmul8x16au(vis_read_hi(row2), vis_read_lo(yFilter)); \
371 xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1)); \
372 v21 = vis_fmul8x16au(vis_read_lo(row2), vis_read_lo(yFilter)); \
373 xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1)); \
374 v30 = vis_fmul8x16al(vis_read_hi(row3), vis_read_lo(yFilter)); \
375 sum0 = vis_fpadd16(v00, v10); \
376 v31 = vis_fmul8x16al(vis_read_lo(row3), vis_read_lo(yFilter)); \
377 sum1 = vis_fpadd16(v01, v11); \
378 sum0 = vis_fpadd16(sum0, v20); \
379 sum1 = vis_fpadd16(sum1, v21); \
380 sum0 = vis_fpadd16(sum0, v30); \
381 sum1 = vis_fpadd16(sum1, v31); \
382 v00 = vis_fmul8sux16(sum0, xFilter0); \
383 v01 = vis_fmul8sux16(sum1, xFilter1); \
384 v10 = vis_fmul8ulx16(sum0, xFilter0); \
385 sum0 = vis_fpadd16(v00, v10); \
386 v11 = vis_fmul8ulx16(sum1, xFilter1); \
387 sum1 = vis_fpadd16(v01, v11); \
388 d0 = vis_fpadd16(sum0, sum1); \
389 v00 = vis_write_lo(v00, vis_fpadd16s(vis_read_hi(d0), \
390 vis_read_lo(d0))); \
391 res = vis_write_lo(res, vis_fpack16(v00))
392
393/***************************************************************/
/*
 * BC_U8_2CH(index, mlib_filters_u8): one software-pipelined step of the
 * 2-channel kernel.  Statement order is deliberate; do not regroup.
 *
 *   - CURRENT pixel: row0..row3 are weighted by yFilter and by the
 *     expanded x coefficients (xFilter0/xFilter1, rebuilt from xFilter
 *     through the dr/dr1 vis_fpmerge() chain).  The un-reduced products
 *     are left in d0<index> and d1<index> for FADD_2BC_U8().
 *   - NEXT pixel: four rows starting at sPtr are loaded into row0..row3.
 *     Each row variable is overwritten only after both of its halves
 *     have been consumed above.  Filter offsets are taken from the X/Y
 *     values on entry, and xFilter/yFilter are reloaded near the end.
 *   - Pixel AFTER that: X/Y are advanced by dX/dY and sPtr is recomputed
 *     as lineAddr[ySrc] + 2 * xSrc.
 */
394#define BC_U8_2CH(index, mlib_filters_u8) \
395 v00 = vis_fmul8x16au(vis_read_hi(row0), vis_read_hi(yFilter)); \
396 dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter)); \
397 v01 = vis_fmul8x16au(vis_read_lo(row0), vis_read_hi(yFilter)); \
398 dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr)); \
399 v10 = vis_fmul8x16al(vis_read_hi(row1), vis_read_hi(yFilter)); \
400 dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr)); \
401 v11 = vis_fmul8x16al(vis_read_lo(row1), vis_read_hi(yFilter)); \
402 dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr)); \
403 v20 = vis_fmul8x16au(vis_read_hi(row2), vis_read_lo(yFilter)); \
404 xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1)); \
405 v21 = vis_fmul8x16au(vis_read_lo(row2), vis_read_lo(yFilter)); \
406 xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1)); \
407 v30 = vis_fmul8x16al(vis_read_hi(row3), vis_read_lo(yFilter)); \
408 v31 = vis_fmul8x16al(vis_read_lo(row3), vis_read_lo(yFilter)); \
409 ALIGN_ADDR(dpSrc, sPtr); \
410 data0 = dpSrc[0]; \
411 sum0 = vis_fpadd16(v00, v10); \
412 filterposy = (Y >> FILTER_SHIFT); \
413 data1 = dpSrc[1]; \
414 row0 = vis_faligndata(data0, data1); \
415 filterposx = (X >> FILTER_SHIFT); \
416 sPtr += srcYStride; \
417 ALIGN_ADDR(dpSrc, sPtr); \
418 data0 = dpSrc[0]; \
419 sum1 = vis_fpadd16(v01, v11); \
420 X += dX; \
421 data1 = dpSrc[1]; \
422 sum0 = vis_fpadd16(sum0, v20); \
423 row1 = vis_faligndata(data0, data1); \
424 sPtr += srcYStride; \
425 ALIGN_ADDR(dpSrc, sPtr); \
426 Y += dY; \
427 sum1 = vis_fpadd16(sum1, v21); \
428 xSrc = (X>>MLIB_SHIFT)-1; \
429 data0 = dpSrc[0]; \
430 ySrc = (Y>>MLIB_SHIFT)-1; \
431 sum0 = vis_fpadd16(sum0, v30); \
432 data1 = dpSrc[1]; \
433 filterposy &= FILTER_MASK; \
434 sum1 = vis_fpadd16(sum1, v31); \
435 v00 = vis_fmul8sux16(sum0, xFilter0); \
436 row2 = vis_faligndata(data0, data1); \
437 v01 = vis_fmul8sux16(sum1, xFilter1); \
438 sPtr += srcYStride; \
439 v10 = vis_fmul8ulx16(sum0, xFilter0); \
440 filterposx &= FILTER_MASK; \
441 ALIGN_ADDR(dpSrc, sPtr); \
442 v11= vis_fmul8ulx16(sum1, xFilter1); \
443 data0 = dpSrc[0]; \
444 d0##index = vis_fpadd16(v00, v10); \
445 data1 = dpSrc[1]; \
446 row3 = vis_faligndata(data0, data1); \
447 yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
448 d1##index = vis_fpadd16(v01, v11); \
449 xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx)); \
450 sPtr = (mlib_u8 *)lineAddr[ySrc] + (xSrc<<1)
451
452/***************************************************************/
/*
 * Bicubic affine transform of a 2-channel mlib_u8 image.
 *
 * param: affine parameter block, consumed by DECLAREVAR_BC() and CLIP(),
 *        which are defined outside this file (presumably they provide j,
 *        yStart, yFinish, xLeft, xRight, X, Y, dX, dY, lineAddr,
 *        srcYStride, dstPixelPtr, dstData, filter -- TODO confirm).
 *
 * Unlike the 1-channel version there is no alignment pre-loop.  The
 * pipelined section writes four pixels (8 bytes) at a time to a possibly
 * unaligned destination: the packed result is rotated with
 * vis_faligndata() and written with two partial stores (vis_pst_8) to
 * the aligned words dp and dp + 1, using `mask` and its complement.
 * The pipelined section is taken only when the row has at least 10
 * pixels; the remaining pixels are written two bytes at a time.
 *
 * Returns MLIB_SUCCESS on every path visible here.
 */
453mlib_status mlib_ImageAffine_u8_2ch_bc (mlib_affine_param *param)
454{
455 DECLAREVAR_BC();
456 DTYPE *dstLineEnd;
457 mlib_s32 filterposx, filterposy;
458 mlib_d64 data0, data1;
459 mlib_d64 sum0, sum1;
460 mlib_d64 row0, row1, row2, row3;
461 mlib_f32 p0, p1, p2, p3;
462 mlib_d64 xFilter;
463 mlib_d64 xFilter0, xFilter1, yFilter;
464 mlib_d64 v00, v10, v20, v30;
465 mlib_d64 v01, v11, v21, v31;
466 mlib_d64 d0, d1, d2, d3;
467 mlib_d64 d00, d01, d02, d03;
468 mlib_d64 d10, d11, d12, d13;
469 mlib_d64 e0, e1;
470 mlib_d64 *dpSrc;
471 mlib_s32 cols, i, mask, off;
472 mlib_d64 dr, dr1;
473 mlib_d64 res, *dp;
474 const mlib_s16 *mlib_filters_table;
475
 /* Any filter value other than MLIB_BICUBIC selects the "bc2" table. */
476 if (filter == MLIB_BICUBIC) {
477 mlib_filters_table = mlib_filters_u8_bc;
478 } else {
479 mlib_filters_table = mlib_filters_u8_bc2;
480 }
481
482 for (j = yStart; j <= yFinish; j++) {
483
484 vis_write_gsr(3 << 3);
485
486 CLIP(2);
 /* After the += 1 below, dstLineEnd addresses the row's last byte. */
487 dstLineEnd = (DTYPE*)dstData + 2 * xRight;
488
489 cols = xRight - xLeft + 1;
 /* dp: aligned destination word; off: byte offset of dstPixelPtr in it. */
490 dp = vis_alignaddr(dstPixelPtr, 0);
491 off = dstPixelPtr - (mlib_u8*)dp;
492 dstLineEnd += 1;
493 mask = vis_edge8(dstPixelPtr, dstLineEnd);
494 i = 0;
495
496 if (i <= cols - 10) {
497
 /* Prologue: load pixel 0, then compute 8 pixels' worth of products. */
498 NEXT_PIXEL_2BC_U8();
499 LOAD_BC_U8_2CH_1PIXEL(mlib_filters_table);
500
501 NEXT_PIXEL_2BC_U8();
502
503 BC_U8_2CH(0, mlib_filters_table);
504 BC_U8_2CH(1, mlib_filters_table);
505 BC_U8_2CH(2, mlib_filters_table);
506 BC_U8_2CH(3, mlib_filters_table);
507
508 FADD_2BC_U8();
509
510 BC_U8_2CH(0, mlib_filters_table);
511 BC_U8_2CH(1, mlib_filters_table);
512 BC_U8_2CH(2, mlib_filters_table);
513 BC_U8_2CH(3, mlib_filters_table);
514
 /*
  * Steady state.  dstPixelPtr is not advanced here; only dp moves
  * (8 bytes per iteration), so the rotation amount stays constant.
  * The alignment must be re-set each time because BC_U8_2CH changes
  * it through ALIGN_ADDR.
  */
515#pragma pipeloop(0)
516 for (; i <= cols-14; i+=4) {
517 vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
518 res = vis_faligndata(res, res);
519 vis_pst_8(res, dp++, mask);
520 vis_pst_8(res, dp, ~mask);
521 FADD_2BC_U8();
522 BC_U8_2CH(0, mlib_filters_table);
523 BC_U8_2CH(1, mlib_filters_table);
524 BC_U8_2CH(2, mlib_filters_table);
525 BC_U8_2CH(3, mlib_filters_table);
526 }
527
 /* Epilogue: drain two groups of four pixels ... */
528 vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
529 res = vis_faligndata(res, res);
530 vis_pst_8(res, dp++, mask);
531 vis_pst_8(res, dp, ~mask);
532
533 FADD_2BC_U8();
534 vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
535 res = vis_faligndata(res, res);
536 vis_pst_8(res, dp++, mask);
537 vis_pst_8(res, dp, ~mask);
538
 /* ... resynchronise the byte pointer with the word pointer ... */
539 dstPixelPtr = (mlib_u8*)dp + off;
540
 /*
  * ... and finish the two pixels still in flight.  Each pixel is
  * written as two byte stores: second channel first, then `res` is
  * rotated by 7 bytes so the first channel can be stored too.
  */
541 RESULT_2BC_U8_1PIXEL();
542 vis_alignaddr((void *)7, 0);
543 vis_st_u8(res, dstPixelPtr+1);
544 res = vis_faligndata(res, res);
545 vis_st_u8(res, dstPixelPtr);
546 dstPixelPtr += 2;
547
548 LOAD_BC_U8_2CH_1PIXEL(mlib_filters_table);
549 RESULT_2BC_U8_1PIXEL();
550 vis_alignaddr((void *)7, 0);
551 vis_st_u8(res, dstPixelPtr+1);
552 res = vis_faligndata(res, res);
553 vis_st_u8(res, dstPixelPtr);
554 dstPixelPtr += 2;
555 i += 10;
556 }
557
 /* Remaining pixels (or the whole row when cols < 10), one at a time. */
558 for (; i < cols; i++) {
559 NEXT_PIXEL_2BC_U8();
560 LOAD_BC_U8_2CH_1PIXEL(mlib_filters_table);
561 RESULT_2BC_U8_1PIXEL();
562 vis_alignaddr((void *)7, 0);
563 vis_st_u8(res, dstPixelPtr+1);
564 res = vis_faligndata(res, res);
565 vis_st_u8(res, dstPixelPtr);
566 dstPixelPtr += 2;
567 }
568 }
569
570 return MLIB_SUCCESS;
571}
572
573/***************************************************************/
/*
 * FADD_3BC_U8(): reduce the twelve 16-bit products held in d0, d1, d2
 * (as produced by the 3-channel multiply stage) to one value per channel
 * and pack them: f0.f = vis_fpack16(d0).  Call sites then store
 * f0.t[0..2].
 *
 * Without MLIB_VIS2: vis_faligndata() with byte offsets 6 and 2 is used
 * to slide the three vectors against each other so that lanes belonging
 * to the same channel line up before vis_fpadd16().  This changes the
 * alignment offset set by vis_alignaddr(), so any later
 * vis_faligndata() needs a fresh ALIGN_ADDR().
 *
 * With MLIB_VIS2: the same regrouping uses offset 4 plus vis_bshuffle(),
 * which relies on the byte mask written earlier via MLIB_WRITE_BMASK.
 *
 * Both variants overwrite d0..d4.
 */
574#ifndef MLIB_VIS2
575
576#define FADD_3BC_U8() \
577 vis_alignaddr((void*)6, 0); \
578 d3 = vis_faligndata(d0, d1); \
579 vis_alignaddr((void*)2, 0); \
580 d4 = vis_faligndata(d1, d2); \
581 d0 = vis_fpadd16(d0, d3); \
582 d2 = vis_fpadd16(d2, d4); \
583 d1 = vis_faligndata(d2, d2); \
584 d0 = vis_fpadd16(d0, d1); \
585 f0.f = vis_fpack16(d0)
586
587#else
588
589#define FADD_3BC_U8() \
590 vis_alignaddr((void*)4, 0); \
591 d3 = vis_bshuffle(d0, d1); \
592 d1 = vis_faligndata(d1, d2); \
593 d2 = vis_faligndata(d2, d2); \
594 d4 = vis_bshuffle(d1, d2); \
595 d0 = vis_fpadd16(d0, d3); \
596 d1 = vis_fpadd16(d1, d4); \
597 d0 = vis_fpadd16(d0, d1); \
598 f0.f = vis_fpack16(d0)
599
600#endif /* MLIB_VIS2 */
601
602/***************************************************************/
/*
 * LOAD_BC_U8_3CH_1PIXEL(mlib_filters_u8, mlib_filters_u8_3): fetch
 * everything needed to filter one 3-channel pixel.
 *
 * yFilter is read from mlib_filters_u8 at byte offset filterposy.  The x
 * coefficients come from the second table at byte offset 3 * filterposx
 * and span three consecutive 8-byte entries (xFilter0..xFilter2).
 * X and Y are then advanced by dX/dY.
 *
 * For each of the four source rows (srcYStride apart, starting at sPtr)
 * three 8-byte words are loaded and combined with vis_faligndata() into
 * two vectors, row<r>0 and row<r>1.  sPtr is left on the fourth row.
 */
603#define LOAD_BC_U8_3CH_1PIXEL(mlib_filters_u8, mlib_filters_u8_3) \
604 filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK; \
605 yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
606 filterposx = (X >> FILTER_SHIFT) & FILTER_MASK; \
607 xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_3+3*filterposx)); \
608 xFilter0 = xPtr[0]; \
609 xFilter1 = xPtr[1]; \
610 xFilter2 = xPtr[2]; \
611 X += dX; \
612 Y += dY; \
613 ALIGN_ADDR(dpSrc, sPtr); \
614 data0 = dpSrc[0]; \
615 data1 = dpSrc[1]; \
616 data2 = dpSrc[2]; \
617 row00 = vis_faligndata(data0, data1); \
618 row01 = vis_faligndata(data1, data2); \
619 sPtr += srcYStride; \
620 ALIGN_ADDR(dpSrc, sPtr); \
621 data0 = dpSrc[0]; \
622 data1 = dpSrc[1]; \
623 data2 = dpSrc[2]; \
624 row10 = vis_faligndata(data0, data1); \
625 row11 = vis_faligndata(data1, data2); \
626 sPtr += srcYStride; \
627 ALIGN_ADDR(dpSrc, sPtr); \
628 data0 = dpSrc[0]; \
629 data1 = dpSrc[1]; \
630 data2 = dpSrc[2]; \
631 row20 = vis_faligndata(data0, data1); \
632 row21 = vis_faligndata(data1, data2); \
633 sPtr += srcYStride; \
634 ALIGN_ADDR(dpSrc, sPtr); \
635 data0 = dpSrc[0]; \
636 data1 = dpSrc[1]; \
637 data2 = dpSrc[2]; \
638 row30 = vis_faligndata(data0, data1); \
639 row31 = vis_faligndata(data1, data2)
640
641/***************************************************************/
/*
 * STORE_BC_U8_3CH_1PIXEL(): write the first three bytes of the packed
 * result f0 (f0.t[0..2]) to the destination and advance dstPixelPtr by
 * one 3-byte pixel.  The fourth byte of f0 is not stored.
 */
642#define STORE_BC_U8_3CH_1PIXEL() \
643 dstPixelPtr[0] = f0.t[0]; \
644 dstPixelPtr[1] = f0.t[1]; \
645 dstPixelPtr[2] = f0.t[2]; \
646 dstPixelPtr += 3
647
648/***************************************************************/
/*
 * NEXT_PIXEL_3BC_U8(): same as the 1-channel variant, but the column is
 * scaled by 3 bytes per pixel: sPtr = lineAddr[ySrc] + 3 * xSrc, with
 * xSrc/ySrc = (X or Y >> MLIB_SHIFT) - 1.  X and Y are not modified.
 */
649#define NEXT_PIXEL_3BC_U8() \
650 xSrc = (X>>MLIB_SHIFT)-1; \
651 ySrc = (Y>>MLIB_SHIFT)-1; \
652 sPtr = (mlib_u8 *)lineAddr[ySrc] + (3*xSrc)
653
654/***************************************************************/
/*
 * RESULT_3BC_U8_1PIXEL(): compute one finished 3-channel pixel from the
 * rows row00..row31 and the current yFilter / xFilter0..xFilter2.
 *
 * Vertical pass: three 4-byte groups per row (high and low half of
 * row<r>0, high half of row<r>1) are multiplied by the matching 16-bit
 * y coefficient and accumulated in sum0, sum1, sum2.
 *
 * Horizontal pass: each sum is multiplied by its x-coefficient vector
 * with the vis_fmul8sux16()/vis_fmul8ulx16() pair, giving d0, d1, d2,
 * which FADD_3BC_U8() reduces and packs into f0.
 *
 * The expansion deliberately does NOT end with a semicolon (matching the
 * other macros in this file): the caller supplies it, so the macro
 * behaves as a single statement sequence terminated at the call site.
 */
655#define RESULT_3BC_U8_1PIXEL() \
656 v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
657 v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
658 v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
659 v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
660 v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
661 v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
662 v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
663 sum0 = vis_fpadd16(v00, v10); \
664 v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
665 sum1 = vis_fpadd16(v01, v11); \
666 v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
667 sum2 = vis_fpadd16(v02, v12); \
668 v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
669 sum0 = vis_fpadd16(sum0, v20); \
670 v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
671 sum1 = vis_fpadd16(sum1, v21); \
672 v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
673 sum2 = vis_fpadd16(sum2, v22); \
674 sum0 = vis_fpadd16(sum0, v30); \
675 sum1 = vis_fpadd16(sum1, v31); \
676 v00 = vis_fmul8sux16(sum0, xFilter0); \
677 sum2 = vis_fpadd16(sum2, v32); \
678 v01 = vis_fmul8ulx16(sum0, xFilter0); \
679 v10 = vis_fmul8sux16(sum1, xFilter1); \
680 d0 = vis_fpadd16(v00, v01); \
681 v11 = vis_fmul8ulx16(sum1, xFilter1); \
682 v20 = vis_fmul8sux16(sum2, xFilter2); \
683 d1 = vis_fpadd16(v10, v11); \
684 v21 = vis_fmul8ulx16(sum2, xFilter2); \
685 d2 = vis_fpadd16(v20, v21); \
686 FADD_3BC_U8()
687
688/***************************************************************/
/*
 * BC_U8_3CH(mlib_filters_u8, mlib_filters_u8_3): one software-pipelined
 * step of the 3-channel kernel.  Statement order is deliberate; do not
 * regroup.
 *
 *   - CURRENT pixel: row00..row31 are weighted by yFilter, summed into
 *     sum0..sum2 and multiplied by xFilter0..xFilter2; the un-reduced
 *     products are left in d0, d1, d2 for FADD_3BC_U8().
 *   - NEXT pixel: four rows starting at sPtr are loaded (three 8-byte
 *     words each) into row00..row31; each row pair is overwritten only
 *     after the multiplies that read it.  Filter offsets are taken from
 *     the X/Y values on entry, and yFilter / xFilter0..2 are reloaded
 *     after their last use.
 *   - Pixel AFTER that: X/Y are advanced by dX/dY and sPtr is recomputed
 *     as lineAddr[ySrc] + 3 * xSrc.
 */
689#define BC_U8_3CH(mlib_filters_u8, mlib_filters_u8_3) \
690 v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
691 v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
692 v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
693 ALIGN_ADDR(dpSrc, sPtr); \
694 data0 = dpSrc[0]; \
695 filterposy = (Y >> FILTER_SHIFT); \
696 v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
697 data1 = dpSrc[1]; \
698 v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
699 sum0 = vis_fpadd16(v00, v10); \
700 data2 = dpSrc[2]; \
701 row00 = vis_faligndata(data0, data1); \
702 v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
703 row01 = vis_faligndata(data1, data2); \
704 filterposx = (X >> FILTER_SHIFT); \
705 sPtr += srcYStride; \
706 ALIGN_ADDR(dpSrc, sPtr); \
707 v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
708 sum1 = vis_fpadd16(v01, v11); \
709 data0 = dpSrc[0]; \
710 X += dX; \
711 data1 = dpSrc[1]; \
712 v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
713 sum2 = vis_fpadd16(v02, v12); \
714 data2 = dpSrc[2]; \
715 row10 = vis_faligndata(data0, data1); \
716 v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
717 row11 = vis_faligndata(data1, data2); \
718 sPtr += srcYStride; \
719 ALIGN_ADDR(dpSrc, sPtr); \
720 Y += dY; \
721 xSrc = (X>>MLIB_SHIFT)-1; \
722 v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
723 sum0 = vis_fpadd16(sum0, v20); \
724 data0 = dpSrc[0]; \
725 ySrc = (Y>>MLIB_SHIFT)-1; \
726 data1 = dpSrc[1]; \
727 v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
728 sum1 = vis_fpadd16(sum1, v21); \
729 data2 = dpSrc[2]; \
730 filterposy &= FILTER_MASK; \
731 row20 = vis_faligndata(data0, data1); \
732 v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
733 row21 = vis_faligndata(data1, data2); \
734 sPtr += srcYStride; \
735 filterposx &= FILTER_MASK; \
736 sum2 = vis_fpadd16(sum2, v22); \
737 ALIGN_ADDR(dpSrc, sPtr); \
738 sum0 = vis_fpadd16(sum0, v30); \
739 data0 = dpSrc[0]; \
740 sum1 = vis_fpadd16(sum1, v31); \
741 v00 = vis_fmul8sux16(sum0, xFilter0); \
742 data1 = dpSrc[1]; \
743 sum2 = vis_fpadd16(sum2, v32); \
744 v01 = vis_fmul8ulx16(sum0, xFilter0); \
745 data2 = dpSrc[2]; \
746 row30 = vis_faligndata(data0, data1); \
747 v10 = vis_fmul8sux16(sum1, xFilter1); \
748 d0 = vis_fpadd16(v00, v01); \
749 row31 = vis_faligndata(data1, data2); \
750 yFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposy)); \
751 v11 = vis_fmul8ulx16(sum1, xFilter1); \
752 xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_3+3*filterposx)); \
753 xFilter0 = xPtr[0]; \
754 v20 = vis_fmul8sux16(sum2, xFilter2); \
755 d1 = vis_fpadd16(v10, v11); \
756 xFilter1 = xPtr[1]; \
757 v21 = vis_fmul8ulx16(sum2, xFilter2); \
758 xFilter2 = xPtr[2]; \
759 sPtr = (mlib_u8 *)lineAddr[ySrc] + (3*xSrc); \
760 d2 = vis_fpadd16(v20, v21)
761
762/***************************************************************/
/*
 * Bicubic affine transform of a 3-channel mlib_u8 image.
 *
 * param: affine parameter block, consumed by DECLAREVAR_BC() and CLIP(),
 *        which are defined outside this file (presumably they provide j,
 *        yStart, yFinish, xLeft, xRight, X, Y, dX, dY, lineAddr,
 *        srcYStride, dstPixelPtr, filter -- TODO confirm).
 *
 * Two table pairs are selected by `filter`: one indexed directly for the
 * y coefficients and a "_3" table indexed at 3 * offset for the x
 * coefficients.  One pixel is produced per pipeline step and written as
 * three single-byte stores through the f0 union.  The pipelined section
 * is taken when a row has at least 4 pixels; its prologue and epilogue
 * together account for exactly 4 pixels.
 *
 * Unlike the 1- and 2-channel functions, the GSR and byte mask are
 * written once before the row loop rather than once per row.
 *
 * Returns MLIB_SUCCESS on every path visible here.
 */
763mlib_status mlib_ImageAffine_u8_3ch_bc (mlib_affine_param *param)
764{
765 DECLAREVAR_BC();
766 mlib_s32 filterposx, filterposy;
767 mlib_d64 data0, data1, data2;
768 mlib_d64 sum0, sum1, sum2;
769 mlib_d64 row00, row10, row20, row30;
770 mlib_d64 row01, row11, row21, row31;
771 mlib_d64 xFilter0, xFilter1, xFilter2, yFilter;
772 mlib_d64 v00, v10, v20, v30;
773 mlib_d64 v01, v11, v21, v31;
774 mlib_d64 v02, v12, v22, v32;
775 mlib_d64 d0, d1, d2, d3, d4;
776 mlib_d64 *dpSrc;
777 mlib_s32 cols, i;
778 mlib_d64 *xPtr;
 /* Lets the packed 32-bit result be read back byte by byte. */
779 union {
780 mlib_u8 t[4];
781 mlib_f32 f;
782 } f0;
783 const mlib_s16 *mlib_filters_table ;
784 const mlib_s16 *mlib_filters_table_3;
785
 /* Any filter value other than MLIB_BICUBIC selects the "bc2" tables. */
786 if (filter == MLIB_BICUBIC) {
787 mlib_filters_table = mlib_filters_u8_bc;
788 mlib_filters_table_3 = mlib_filters_u8_bc_3;
789 } else {
790 mlib_filters_table = mlib_filters_u8_bc2;
791 mlib_filters_table_3 = mlib_filters_u8_bc2_3;
792 }
793
794 vis_write_gsr(3 << 3);
795 MLIB_WRITE_BMASK(0x6789ABCD);
796
797 for (j = yStart; j <= yFinish; j ++) {
798
799 CLIP(3);
800
801 cols = xRight - xLeft + 1;
802 i = 0;
803
804 if (i <= cols - 4) {
805
 /* Prologue: load pixel 0, compute it, and start pixel 1. */
806 NEXT_PIXEL_3BC_U8();
807 LOAD_BC_U8_3CH_1PIXEL(mlib_filters_table, mlib_filters_table_3);
808
809 NEXT_PIXEL_3BC_U8();
810
811 BC_U8_3CH(mlib_filters_table, mlib_filters_table_3);
812 FADD_3BC_U8();
813
814 BC_U8_3CH(mlib_filters_table, mlib_filters_table_3);
815
 /* Steady state: store one pixel, reduce the next, start another. */
816#pragma pipeloop(0)
817 for (; i < cols-4; i++) {
818 STORE_BC_U8_3CH_1PIXEL();
819
820 FADD_3BC_U8();
821 BC_U8_3CH(mlib_filters_table, mlib_filters_table_3);
822 }
823
 /* Epilogue: drain the four pixels still in the pipeline. */
824 STORE_BC_U8_3CH_1PIXEL();
825
826 FADD_3BC_U8();
827 STORE_BC_U8_3CH_1PIXEL();
828
829 RESULT_3BC_U8_1PIXEL();
830 STORE_BC_U8_3CH_1PIXEL();
831
832 LOAD_BC_U8_3CH_1PIXEL(mlib_filters_table, mlib_filters_table_3);
833 RESULT_3BC_U8_1PIXEL();
834 STORE_BC_U8_3CH_1PIXEL();
835 i += 4;
836 }
837
 /* Rows shorter than 4 pixels are handled entirely here. */
838 for (; i < cols; i++) {
839 NEXT_PIXEL_3BC_U8();
840 LOAD_BC_U8_3CH_1PIXEL(mlib_filters_table, mlib_filters_table_3);
841 RESULT_3BC_U8_1PIXEL();
842 STORE_BC_U8_3CH_1PIXEL();
843 }
844 }
845
846 return MLIB_SUCCESS;
847}
848
849/***************************************************************/
/*
 * FADD_4BC_U8(): finish two 4-channel pixels at once.
 *
 * d00, d10, d20, d30 (the four product vectors written by the multiply
 * stage with ind = 0) are added into d0; d01, d11, d21, d31 (ind = 1)
 * are added into d2.  Both sums are packed together with
 * vis_fpack16_pair() into `res` (8 bytes).
 * Overwrites d0, d1, d2, d3.
 */
850#define FADD_4BC_U8() \
851 d0 = vis_fpadd16(d00, d10); \
852 d1 = vis_fpadd16(d20, d30); \
853 d0 = vis_fpadd16(d0, d1); \
854 d2 = vis_fpadd16(d01, d11); \
855 d3 = vis_fpadd16(d21, d31); \
856 d2 = vis_fpadd16(d2, d3); \
857 res = vis_fpack16_pair(d0, d2)
858
859/***************************************************************/
/*
 * LOAD_BC_U8_4CH_1PIXEL(mlib_filters_u8, mlib_filters_u8_4): fetch
 * everything needed to filter one 4-channel pixel.
 *
 * yFilter is read from mlib_filters_u8 at byte offset filterposy.  The x
 * coefficients come from the second table at byte offset 4 * filterposx
 * and span four consecutive 8-byte entries (xFilter0..xFilter3).
 * X and Y are then advanced by dX/dY.
 *
 * For each of the four source rows (srcYStride apart, starting at sPtr)
 * three 8-byte words are loaded and combined with vis_faligndata() into
 * two vectors, row<r>0 and row<r>1.  sPtr is left on the fourth row.
 */
860#define LOAD_BC_U8_4CH_1PIXEL(mlib_filters_u8, mlib_filters_u8_4) \
861 filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK; \
862 yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
863 filterposx = (X >> FILTER_SHIFT) & FILTER_MASK; \
864 xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_4+4*filterposx)); \
865 xFilter0 = xPtr[0]; \
866 xFilter1 = xPtr[1]; \
867 xFilter2 = xPtr[2]; \
868 xFilter3 = xPtr[3]; \
869 X += dX; \
870 Y += dY; \
871 ALIGN_ADDR(dpSrc, sPtr); \
872 data0 = dpSrc[0]; \
873 data1 = dpSrc[1]; \
874 data2 = dpSrc[2]; \
875 row00 = vis_faligndata(data0, data1); \
876 row01 = vis_faligndata(data1, data2); \
877 sPtr += srcYStride; \
878 ALIGN_ADDR(dpSrc, sPtr); \
879 data0 = dpSrc[0]; \
880 data1 = dpSrc[1]; \
881 data2 = dpSrc[2]; \
882 row10 = vis_faligndata(data0, data1); \
883 row11 = vis_faligndata(data1, data2); \
884 sPtr += srcYStride; \
885 ALIGN_ADDR(dpSrc, sPtr); \
886 data0 = dpSrc[0]; \
887 data1 = dpSrc[1]; \
888 data2 = dpSrc[2]; \
889 row20 = vis_faligndata(data0, data1); \
890 row21 = vis_faligndata(data1, data2); \
891 sPtr += srcYStride; \
892 ALIGN_ADDR(dpSrc, sPtr); \
893 data0 = dpSrc[0]; \
894 data1 = dpSrc[1]; \
895 data2 = dpSrc[2]; \
896 row30 = vis_faligndata(data0, data1); \
897 row31 = vis_faligndata(data1, data2)
898
899/***************************************************************/
/*
 * NEXT_PIXEL_4BC_U8(): same as the 1-channel variant, but the column is
 * scaled by 4 bytes per pixel: sPtr = lineAddr[ySrc] + 4 * xSrc, with
 * xSrc/ySrc = (X or Y >> MLIB_SHIFT) - 1.  X and Y are not modified.
 */
900#define NEXT_PIXEL_4BC_U8() \
901 xSrc = (X>>MLIB_SHIFT)-1; \
902 ySrc = (Y>>MLIB_SHIFT)-1; \
903 sPtr = (mlib_u8 *)lineAddr[ySrc] + (4*xSrc)
904
905/***************************************************************/
/*
 * RESULT_4BC_U8_1PIXEL(ind): multiply stage for one 4-channel pixel,
 * without any loads.
 *
 * Vertical pass: four 4-byte groups per row (high/low half of row<r>0
 * and of row<r>1) are multiplied by the matching 16-bit y coefficient
 * and accumulated in sum0..sum3.
 *
 * Horizontal pass: each sum is multiplied by its x-coefficient vector
 * (xFilter0..xFilter3) with the vis_fmul8sux16()/vis_fmul8ulx16() pair;
 * the four product vectors are left in d0<ind>, d1<ind>, d2<ind>,
 * d3<ind> for FADD_4BC_U8() to add and pack.
 * Overwrites v00..v33 and sum0..sum3.
 */
906#define RESULT_4BC_U8_1PIXEL(ind) \
907 v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
908 v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
909 v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
910 v03 = vis_fmul8x16au(vis_read_lo(row01), vis_read_hi(yFilter)); \
911 v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
912 v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
913 sum0 = vis_fpadd16(v00, v10); \
914 v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
915 sum1 = vis_fpadd16(v01, v11); \
916 v13 = vis_fmul8x16al(vis_read_lo(row11), vis_read_hi(yFilter)); \
917 sum2 = vis_fpadd16(v02, v12); \
918 v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
919 sum3 = vis_fpadd16(v03, v13); \
920 v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
921 sum0 = vis_fpadd16(sum0, v20); \
922 v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
923 sum1 = vis_fpadd16(sum1, v21); \
924 v23 = vis_fmul8x16au(vis_read_lo(row21), vis_read_lo(yFilter)); \
925 sum2 = vis_fpadd16(sum2, v22); \
926 v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
927 sum3 = vis_fpadd16(sum3, v23); \
928 v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
929 sum0 = vis_fpadd16(sum0, v30); \
930 v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
931 sum1 = vis_fpadd16(sum1, v31); \
932 v33 = vis_fmul8x16al(vis_read_lo(row31), vis_read_lo(yFilter)); \
933 sum2 = vis_fpadd16(sum2, v32); \
934 v00 = vis_fmul8sux16(sum0, xFilter0); \
935 sum3 = vis_fpadd16(sum3, v33); \
936 v01 = vis_fmul8ulx16(sum0, xFilter0); \
937 v10 = vis_fmul8sux16(sum1, xFilter1); \
938 d0##ind = vis_fpadd16(v00, v01); \
939 v11 = vis_fmul8ulx16(sum1, xFilter1); \
940 v20 = vis_fmul8sux16(sum2, xFilter2); \
941 d1##ind = vis_fpadd16(v10, v11); \
942 v21 = vis_fmul8ulx16(sum2, xFilter2); \
943 v30 = vis_fmul8sux16(sum3, xFilter3); \
944 d2##ind = vis_fpadd16(v20, v21); \
945 v31 = vis_fmul8ulx16(sum3, xFilter3); \
946 d3##ind = vis_fpadd16(v30, v31)
947
948/***************************************************************/
/*
 * BC_U8_4CH(ind, mlib_filters_u8, mlib_filters_u8_4)
 *
 * Software-pipelined step of the 4-channel u8 filter.  One expansion does
 * two things, interleaved statement by statement:
 *
 *   (a) Finishes the pixel whose rows (row00..row31) and coefficients
 *       (yFilter, xFilter0..xFilter3) are already loaded.  The arithmetic
 *       is the same sequence as RESULT_4BC_U8_1PIXEL and the results go to
 *       d0<ind>, d1<ind>, d2<ind>, d3<ind>.
 *
 *   (b) Loads everything for the next pixel:
 *         - four source rows starting at sPtr, stepping sPtr by srcYStride
 *           between rows.  Each row is read as three mlib_d64 words through
 *           ALIGN_ADDR(dpSrc, sPtr) and combined with two vis_faligndata
 *           calls into rowN0/rowN1;
 *         - filterposy / filterposx from Y / X (>> FILTER_SHIFT, then
 *           & FILTER_MASK), taken before X and Y are advanced;
 *         - yFilter from byte offset filterposy in mlib_filters_u8, and
 *           xFilter0..3 from byte offset 4 * filterposx in
 *           mlib_filters_u8_4.
 *       It then advances X += dX and Y += dY and leaves sPtr pointing at
 *       the source position for the pixel after that (same formula as
 *       NEXT_PIXEL_4BC_U8).
 *
 * Preconditions on entry: sPtr addresses the source block for the current
 * (X, Y), and the row and filter variables hold the previous pixel's data.
 *
 * Ordering constraint: every row or filter variable is overwritten only
 * after its last read in part (a) (e.g. row00/row01 are reloaded after
 * v00..v03 are formed, yFilter after v33, xFilter3 after v31).  Do not
 * reorder statements without re-checking these dependencies.
 *
 * Parameters:
 *   ind                - suffix pasted onto d0..d3 to select the output set.
 *   mlib_filters_u8    - table supplying yFilter.
 *   mlib_filters_u8_4  - table supplying xFilter0..xFilter3.
 *
 * All other names are variables of the expanding scope.  The expansion has
 * no trailing semicolon.
 */
949#define BC_U8_4CH(ind, mlib_filters_u8, mlib_filters_u8_4) \
950 v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
951 v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
952 v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
953 v03 = vis_fmul8x16au(vis_read_lo(row01), vis_read_hi(yFilter)); \
954 ALIGN_ADDR(dpSrc, sPtr); \
955 data0 = dpSrc[0]; \
956 filterposy = (Y >> FILTER_SHIFT); /* sampled before Y += dY below */ \
957 v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
958 data1 = dpSrc[1]; \
959 v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
960 sum0 = vis_fpadd16(v00, v10); \
961 data2 = dpSrc[2]; \
962 row00 = vis_faligndata(data0, data1); /* old row00 already consumed above */ \
963 v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
964 row01 = vis_faligndata(data1, data2); \
965 filterposx = (X >> FILTER_SHIFT); /* sampled before X += dX below */ \
966 v13 = vis_fmul8x16al(vis_read_lo(row11), vis_read_hi(yFilter)); \
967 sPtr += srcYStride; \
968 ALIGN_ADDR(dpSrc, sPtr); \
969 v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
970 sum1 = vis_fpadd16(v01, v11); \
971 data0 = dpSrc[0]; \
972 X += dX; \
973 data1 = dpSrc[1]; \
974 v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
975 sum2 = vis_fpadd16(v02, v12); \
976 data2 = dpSrc[2]; \
977 row10 = vis_faligndata(data0, data1); \
978 v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
979 row11 = vis_faligndata(data1, data2); \
980 sPtr += srcYStride; \
981 ALIGN_ADDR(dpSrc, sPtr); \
982 v23 = vis_fmul8x16au(vis_read_lo(row21), vis_read_lo(yFilter)); \
983 sum3 = vis_fpadd16(v03, v13); \
984 Y += dY; \
985 xSrc = (X>>MLIB_SHIFT)-1; /* uses the advanced X: position after the one being loaded */ \
986 v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
987 sum0 = vis_fpadd16(sum0, v20); \
988 data0 = dpSrc[0]; \
989 ySrc = (Y>>MLIB_SHIFT)-1; \
990 data1 = dpSrc[1]; \
991 v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
992 sum1 = vis_fpadd16(sum1, v21); \
993 data2 = dpSrc[2]; \
994 filterposy &= FILTER_MASK; \
995 row20 = vis_faligndata(data0, data1); \
996 v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
997 row21 = vis_faligndata(data1, data2); \
998 sPtr += srcYStride; \
999 filterposx &= FILTER_MASK; \
1000 v33 = vis_fmul8x16al(vis_read_lo(row31), vis_read_lo(yFilter)); \
1001 sum2 = vis_fpadd16(sum2, v22); \
1002 ALIGN_ADDR(dpSrc, sPtr); \
1003 sum3 = vis_fpadd16(sum3, v23); \
1004 sum0 = vis_fpadd16(sum0, v30); \
1005 data0 = dpSrc[0]; \
1006 sum1 = vis_fpadd16(sum1, v31); \
1007 v00 = vis_fmul8sux16(sum0, xFilter0); \
1008 data1 = dpSrc[1]; \
1009 sum2 = vis_fpadd16(sum2, v32); \
1010 v01 = vis_fmul8ulx16(sum0, xFilter0); \
1011 sum3 = vis_fpadd16(sum3, v33); \
1012 data2 = dpSrc[2]; \
1013 row30 = vis_faligndata(data0, data1); \
1014 v10 = vis_fmul8sux16(sum1, xFilter1); \
1015 d0##ind = vis_fpadd16(v00, v01); \
1016 row31 = vis_faligndata(data1, data2); \
1017 yFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposy)); /* last read of old yFilter was v33 */ \
1018 v11 = vis_fmul8ulx16(sum1, xFilter1); \
1019 xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_4+4*filterposx)); \
1020 xFilter0 = xPtr[0]; \
1021 v20 = vis_fmul8sux16(sum2, xFilter2); \
1022 d1##ind = vis_fpadd16(v10, v11); \
1023 xFilter1 = xPtr[1]; \
1024 v21 = vis_fmul8ulx16(sum2, xFilter2); \
1025 xFilter2 = xPtr[2]; \
1026 v30 = vis_fmul8sux16(sum3, xFilter3); \
1027 d2##ind = vis_fpadd16(v20, v21); \
1028 v31 = vis_fmul8ulx16(sum3, xFilter3); \
1029 xFilter3 = xPtr[3]; \
1030 sPtr = (mlib_u8 *)lineAddr[ySrc] + (4*xSrc); /* ready for the next expansion's loads */ \
1031 d3##ind = vis_fpadd16(v30, v31)
1032
1033/***************************************************************/
/*
 * mlib_ImageAffine_u8_4ch_bc
 *
 * Row-by-row inner routine of the affine transform for 4-channel mlib_u8
 * images using a 4x4 filter kernel (bicubic, or the alternative "bc2"
 * coefficient tables when filter != MLIB_BICUBIC).
 *
 * For every destination row j in [yStart, yFinish] it walks the pixels
 * xLeft..xRight, computes each filtered pixel with the VIS macros above and
 * writes the packed bytes with partial stores, so the destination need not
 * be 8-byte aligned.
 *
 * Parameters:
 *   param - affine parameter block.  Its fields are unpacked into locals by
 *           DECLAREVAR_BC(), which is defined outside this section
 *           (presumably j, yStart, yFinish, xLeft, xRight, X, Y, dX, dY,
 *           lineAddr, srcYStride, dstData, dstPixelPtr, filter, ... -
 *           confirm against that macro).
 *
 * Returns:
 *   MLIB_SUCCESS - the only return statement visible in this function.
 *
 * Pixels per row are consumed in chunks: a pipelined path for >= 6 pixels,
 * then fixed paths for 4, 2 and finally 1 remaining pixel.
 */
1034mlib_status mlib_ImageAffine_u8_4ch_bc (mlib_affine_param *param)
1035{
1036 DECLAREVAR_BC();
1037 DTYPE *dstLineEnd;
1038 mlib_s32 filterposx, filterposy;
1039 mlib_d64 data0, data1, data2;
1040 mlib_d64 sum0, sum1, sum2, sum3;
1041 mlib_d64 row00, row10, row20, row30;
1042 mlib_d64 row01, row11, row21, row31;
1043 mlib_d64 xFilter0, xFilter1, xFilter2, xFilter3, yFilter;
1044 mlib_d64 v00, v10, v20, v30;
1045 mlib_d64 v01, v11, v21, v31;
1046 mlib_d64 v02, v12, v22, v32;
1047 mlib_d64 v03, v13, v23, v33;
1048 mlib_d64 d0, d1, d2, d3;
1049 mlib_d64 d00, d10, d20, d30;
1050 mlib_d64 d01, d11, d21, d31;
1051 mlib_d64 *dpSrc;
1052 mlib_s32 cols, i;
1053 mlib_d64 res, *dp, *xPtr;
1054 mlib_s32 mask, emask, gsrd;
1055 const mlib_s16 *mlib_filters_table ;
1056 const mlib_s16 *mlib_filters_table_4;
1057
  /* Select the coefficient tables: the first feeds yFilter, the "_4"
   * variant feeds xFilter0..xFilter3 (see BC_U8_4CH). */
1058 if (filter == MLIB_BICUBIC) {
1059 mlib_filters_table = mlib_filters_u8_bc;
1060 mlib_filters_table_4 = mlib_filters_u8_bc_4;
1061 } else {
1062 mlib_filters_table = mlib_filters_u8_bc2;
1063 mlib_filters_table_4 = mlib_filters_u8_bc2_4;
1064 }
1065
1066 for (j = yStart; j <= yFinish; j++) {
1067
    /* Writes 3 << 3 to the GSR.  In VIS the bits above the 3-bit align
     * field hold the pack scale factor, so this presumably selects
     * scale 3 for the fpack16 step - confirm against the VIS manual. */
1068 vis_write_gsr(3 << 3);
1069
    /* CLIP is defined outside this section; presumably it sets up xLeft,
     * xRight, X, Y and dstPixelPtr for row j (4 channels) and skips empty
     * rows - confirm. */
1070 CLIP(4);
    /* Last byte of the last destination pixel of this row. */
1071 dstLineEnd = (DTYPE*)dstData + 4 * xRight;
1072 dstLineEnd += 3;
    /* dp: 8-byte aligned doubleword containing the first destination byte.
     * mask: edge mask for the bytes of that doubleword from dstPixelPtr on.
     * gsrd: align offset that makes vis_faligndata(res, res) rotate the
     *       result so its first byte lands at dstPixelPtr's byte lane. */
1073 dp = (mlib_d64*)vis_alignaddr(dstPixelPtr, 0);
1074 mask = vis_edge8(dstPixelPtr, dstLineEnd);
1075 gsrd = ((8 - (mlib_addr)dstPixelPtr) & 7);
1076
    /* cols: number of destination pixels in this row; i: pixels done. */
1077 cols = xRight - xLeft + 1;
1078 i = 0;
1079
    /*
     * Pipelined path, taken when at least 6 pixels remain.  Each BC_U8_4CH
     * finishes one pixel while loading the next; FADD_4BC_U8 (defined
     * outside this section) presumably sums the d0x..d3x partials of the
     * two finished pixels and packs them into res - confirm.
     */
1079 if (i <= cols - 6) {
1081
      /* Prologue: load the first pixel, then position sPtr for the next.
       * LOAD_BC_U8_4CH_1PIXEL presumably also loads the filters and
       * advances X/Y; only its tail is visible here - confirm. */
1082 NEXT_PIXEL_4BC_U8();
1083 LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1084
1085 NEXT_PIXEL_4BC_U8();
1086
1087 BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
1088 BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
1089 FADD_4BC_U8();
1090
1091 BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
1092 BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
1093
      /* Steady state: store the previous pair, combine the pair just
       * computed, and compute the next pair. */
1094#pragma pipeloop(0)
1095 for (; i <= cols-8; i+=2) {
        /* Re-set the align offset for the output rotation (the source
         * loads inside BC_U8_4CH go through ALIGN_ADDR, which presumably
         * changes it), then rotate res to the destination byte lanes. */
1096 vis_alignaddr((void *)gsrd, 0);
1097 res = vis_faligndata(res, res);
1098
        /* Split the 8 rotated bytes across two adjacent doublewords. */
1099 vis_pst_8(res, dp++, mask);
1100 vis_pst_8(res, dp, ~mask);
1101
1102 FADD_4BC_U8();
1103 BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
1104 BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
1105 }
1106
      /* Epilogue: drain the pipeline - store the pending pair, ... */
1107 vis_alignaddr((void *)gsrd, 0);
1108 res = vis_faligndata(res, res);
1109
1110 vis_pst_8(res, dp++, mask);
1111 vis_pst_8(res, dp, ~mask);
1112
      /* ... combine and store the pair already computed, ... */
1113 FADD_4BC_U8();
1114 vis_alignaddr((void *)gsrd, 0);
1115 res = vis_faligndata(res, res);
1116
1117 vis_pst_8(res, dp++, mask);
1118 vis_pst_8(res, dp, ~mask);
1119
      /* ... and finish the last pair without prefetching any further:
       * RESULT uses the already loaded pixel, LOAD fetches one more. */
1120 RESULT_4BC_U8_1PIXEL(0);
1121 LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1122 RESULT_4BC_U8_1PIXEL(1);
1123 FADD_4BC_U8();
1124
1125 vis_alignaddr((void *)gsrd, 0);
1126 res = vis_faligndata(res, res);
1127
1128 vis_pst_8(res, dp++, mask);
1129 vis_pst_8(res, dp, ~mask);
      /* The loop added 2 per iteration; account for the 6 pixels handled
       * by the prologue and epilogue. */
1130 i += 6;
1131 }
1132
    /* At least 4 pixels remain: one pipelined pair plus one plain pair. */
1133 if (i <= cols-4) {
1134 NEXT_PIXEL_4BC_U8();
1135 LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1136
1137 NEXT_PIXEL_4BC_U8();
1138
1139 BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
1140 BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
1141 FADD_4BC_U8();
1142 vis_alignaddr((void *)gsrd, 0);
1143 res = vis_faligndata(res, res);
1144
1145 vis_pst_8(res, dp++, mask);
1146 vis_pst_8(res, dp, ~mask);
1147
1148 RESULT_4BC_U8_1PIXEL(0);
1149 LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1150 RESULT_4BC_U8_1PIXEL(1);
1151 FADD_4BC_U8();
1152
1153 vis_alignaddr((void *)gsrd, 0);
1154 res = vis_faligndata(res, res);
1155
1156 vis_pst_8(res, dp++, mask);
1157 vis_pst_8(res, dp, ~mask);
1158 i += 4;
1159 }
1160
    /* At least 2 pixels remain: load and compute each one separately. */
1161 if (i <= cols-2) {
1162 NEXT_PIXEL_4BC_U8();
1163 LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1164 RESULT_4BC_U8_1PIXEL(0);
1165
1166 NEXT_PIXEL_4BC_U8();
1167 LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1168 RESULT_4BC_U8_1PIXEL(1);
1169 FADD_4BC_U8();
1170
1171 vis_alignaddr((void *)gsrd, 0);
1172 res = vis_faligndata(res, res);
1173
1174 vis_pst_8(res, dp++, mask);
1175 vis_pst_8(res, dp, ~mask);
1176 i += 2;
1177 }
1178
    /* One pixel left. */
1179 if (i < cols) {
1180 NEXT_PIXEL_4BC_U8();
1181 LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
1182 RESULT_4BC_U8_1PIXEL(0);
1183
      /* Add the four per-column products and pack; the same pixel is
       * placed in both halves of res. */
1184 d0 = vis_fpadd16(d00, d10);
1185 d1 = vis_fpadd16(d20, d30);
1186 d0 = vis_fpadd16(d0, d1);
1187 res = vis_fpack16_pair(d0, d0);
1188 vis_alignaddr((void *)gsrd, 0);
1189 res = vis_faligndata(res, res);
1190
      /* Only 4 bytes are valid, so clip the store at dstLineEnd:
       * emask limits the first doubleword, and the second doubleword is
       * written only if it still starts at or before dstLineEnd. */
1191 emask = vis_edge8(dp, dstLineEnd);
1192 vis_pst_8(res, dp++, emask & mask);
1193
1194 if ((mlib_u8*)dp <= (mlib_u8*)dstLineEnd) {
1195 mask = vis_edge8(dp, dstLineEnd);
1196 vis_pst_8(res, dp, mask);
1197 }
1198 }
1199 }
1200
1201 return MLIB_SUCCESS;
1202}
1203
1204/***************************************************************/