blob: 9ec397ebba2ee75ff0dba389bb98355d0a6d17b5 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 1998-2003 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26
27
28/*
29 * FUNCTIONS
30 * mlib_v_ImageChannelInsert_U8
31 * mlib_v_ImageChannelInsert_U8_12_A8D1X8
32 * mlib_v_ImageChannelInsert_U8_12_A8D2X8
33 * mlib_v_ImageChannelInsert_U8_12_D1
34 * mlib_v_ImageChannelInsert_U8_12
35 * mlib_v_ImageChannelInsert_U8_13_A8D1X8
36 * mlib_v_ImageChannelInsert_U8_13_A8D2X8
37 * mlib_v_ImageChannelInsert_U8_13_D1
38 * mlib_v_ImageChannelInsert_U8_13
39 * mlib_v_ImageChannelInsert_U8_14_A8D1X8
40 * mlib_v_ImageChannelInsert_U8_14_A8D2X8
41 * mlib_v_ImageChannelInsert_U8_14_D1
42 * mlib_v_ImageChannelInsert_U8_14
43 * mlib_v_ImageChannelInsert_S16
44 * mlib_v_ImageChannelInsert_S16_12_A8D1X4
45 * mlib_v_ImageChannelInsert_S16_12_A8D2X4
46 * mlib_v_ImageChannelInsert_S16_12_D1
47 * mlib_v_ImageChannelInsert_S16_12
48 * mlib_v_ImageChannelInsert_S16_13_A8D1X4
49 * mlib_v_ImageChannelInsert_S16_13_A8D2X4
50 * mlib_v_ImageChannelInsert_S16_13_D1
51 * mlib_v_ImageChannelInsert_S16_13
52 * mlib_v_ImageChannelInsert_S16_14_A8D1X4
53 * mlib_v_ImageChannelInsert_S16_14_A8D2X4
54 * mlib_v_ImageChannelInsert_S16_14_D1
55 * mlib_v_ImageChannelInsert_S16_14
56 * mlib_v_ImageChannelInsert_S32
57 * mlib_v_ImageChannelInsert_D64
58 *
59 * ARGUMENT
60 * src pointer to source image data
61 * dst pointer to destination image data
62 * slb source image line stride in bytes
63 * dlb destination image line stride in bytes
64 * dsize image data size in pixels
65 * xsize image width in pixels
66 * ysize image height in lines
67 * cmask channel mask
68 *
69 * DESCRIPTION
70 * Copy the 1-channel source image into the selected channel
71 * of the destination image -- VIS version low level functions.
72 *
73 * NOTE
74 * These functions are separated from mlib_v_ImageChannelInsert.c
75 * for loop unrolling and structure clarity.
76 */
77
78#include "vis_proto.h"
79#include "mlib_image.h"
80#include "mlib_v_ImageChannelInsert.h"
81
82/***************************************************************/
83/* general channel insertion: slower due to the inner loop */
84void mlib_v_ImageChannelInsert_U8(const mlib_u8 *src,
85 mlib_s32 slb,
86 mlib_u8 *dst,
87 mlib_s32 dlb,
88 mlib_s32 channels,
89 mlib_s32 channeld,
90 mlib_s32 width,
91 mlib_s32 height,
92 mlib_s32 cmask)
93{
94 mlib_u8 *sp; /* pointer for pixel in src */
95 mlib_u8 *sl; /* pointer for line in src */
96 mlib_u8 *dp; /* pointer for pixel in dst */
97 mlib_u8 *dl; /* pointer for line in dst */
98 mlib_s32 i, j, k; /* indices for x, y, channel */
99 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 };
100 mlib_s32 inc0, inc1, inc2;
101 mlib_u8 s0, s1, s2;
102
103 deltac[channels] = 1;
104 for (i = (channeld - 1), k = 0; i >= 0; i--) {
105 if ((cmask & (1 << i)) == 0)
106 deltac[k]++;
107 else
108 k++;
109 }
110
111 deltac[channels] = channeld;
112 for (i = 1; i < channels; i++) {
113 deltac[channels] -= deltac[i];
114 }
115
116 sp = sl = (void *)src;
117 dp = dl = dst + deltac[0];
118
119 if (channels == 2) {
120 inc0 = deltac[1];
121 inc1 = deltac[2] + inc0;
122 for (j = 0; j < height; j++) {
123#pragma pipeloop(0)
124 for (i = 0; i < width; i++) {
125 s0 = sp[0];
126 s1 = sp[1];
127 dp[0] = s0;
128 dp[inc0] = s1;
129 dp += inc1;
130 sp += 2;
131 }
132
133 sp = sl += slb;
134 dp = dl += dlb;
135 }
136 }
137 else if (channels == 3) {
138 inc0 = deltac[1];
139 inc1 = deltac[2] + inc0;
140 inc2 = deltac[3] + inc1;
141 for (j = 0; j < height; j++) {
142#pragma pipeloop(0)
143 for (i = 0; i < width; i++) {
144 s0 = sp[0];
145 s1 = sp[1];
146 s2 = sp[2];
147 dp[0] = s0;
148 dp[inc0] = s1;
149 dp[inc1] = s2;
150 dp += inc2;
151 sp += 3;
152 }
153
154 sp = sl += slb;
155 dp = dl += dlb;
156 }
157 }
158}
159
160/***************************************************************/
161/* general channel insertion: slower due to the inner loop */
162void mlib_v_ImageChannelInsert_D64(const mlib_d64 *src,
163 mlib_s32 slb,
164 mlib_d64 *dst,
165 mlib_s32 dlb,
166 mlib_s32 channels,
167 mlib_s32 channeld,
168 mlib_s32 width,
169 mlib_s32 height,
170 mlib_s32 cmask)
171{
172 mlib_d64 *sp; /* pointer for pixel in src */
173 mlib_d64 *sl; /* pointer for line in src */
174 mlib_d64 *dp; /* pointer for pixel in dst */
175 mlib_d64 *dl; /* pointer for line in dst */
176 mlib_s32 i, j, k; /* indices for x, y, channel */
177 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 };
178 mlib_s32 inc0, inc1, inc2;
179 mlib_d64 s0, s1, s2;
180
181 deltac[channels] = 1;
182 for (i = (channeld - 1), k = 0; i >= 0; i--) {
183 if ((cmask & (1 << i)) == 0)
184 deltac[k]++;
185 else
186 k++;
187 }
188
189 deltac[channels] = channeld;
190 for (i = 1; i < channels; i++) {
191 deltac[channels] -= deltac[i];
192 }
193
194 sp = sl = (void *)src;
195 dp = dl = dst + deltac[0];
196
197 if (channels == 1) {
198 for (j = 0; j < height; j++) {
199#pragma pipeloop(0)
200 for (i = 0; i < width; i++) {
201 s0 = sp[0];
202 dp[0] = s0;
203 dp += channeld;
204 sp++;
205 }
206
207 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
208 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
209 }
210 }
211 else if (channels == 2) {
212 inc0 = deltac[1];
213 inc1 = deltac[2] + inc0;
214 for (j = 0; j < height; j++) {
215#pragma pipeloop(0)
216 for (i = 0; i < width; i++) {
217 s0 = sp[0];
218 s1 = sp[1];
219 dp[0] = s0;
220 dp[inc0] = s1;
221 dp += inc1;
222 sp += 2;
223 }
224
225 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
226 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
227 }
228 }
229 else if (channels == 3) {
230 inc0 = deltac[1];
231 inc1 = deltac[2] + inc0;
232 inc2 = deltac[3] + inc1;
233 for (j = 0; j < height; j++) {
234#pragma pipeloop(0)
235 for (i = 0; i < width; i++) {
236 s0 = sp[0];
237 s1 = sp[1];
238 s2 = sp[2];
239 dp[0] = s0;
240 dp[inc0] = s1;
241 dp[inc1] = s2;
242 dp += inc2;
243 sp += 3;
244 }
245
246 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
247 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
248 }
249 }
250}
251
252/***************************************************************/
253/* general channel insertion: slower due to the inner loop */
254void mlib_v_ImageChannelInsert_S16(const mlib_s16 *src,
255 mlib_s32 slb,
256 mlib_s16 *dst,
257 mlib_s32 dlb,
258 mlib_s32 channels,
259 mlib_s32 channeld,
260 mlib_s32 width,
261 mlib_s32 height,
262 mlib_s32 cmask)
263{
264 mlib_s16 *sp; /* pointer for pixel in src */
265 mlib_s16 *sl; /* pointer for line in src */
266 mlib_s16 *dp; /* pointer for pixel in dst */
267 mlib_s16 *dl; /* pointer for line in dst */
268 mlib_s32 i, j, k; /* indices for x, y, channel */
269 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 };
270 mlib_s32 inc0, inc1, inc2;
271 mlib_s16 s0, s1, s2;
272
273 deltac[channels] = 1;
274 for (i = (channeld - 1), k = 0; i >= 0; i--) {
275 if ((cmask & (1 << i)) == 0)
276 deltac[k]++;
277 else
278 k++;
279 }
280
281 deltac[channels] = channeld;
282 for (i = 1; i < channels; i++) {
283 deltac[channels] -= deltac[i];
284 }
285
286 sp = sl = (void *)src;
287 dp = dl = dst + deltac[0];
288
289 if (channels == 2) {
290 inc0 = deltac[1];
291 inc1 = deltac[2] + inc0;
292 for (j = 0; j < height; j++) {
293#pragma pipeloop(0)
294 for (i = 0; i < width; i++) {
295 s0 = sp[0];
296 s1 = sp[1];
297 dp[0] = s0;
298 dp[inc0] = s1;
299 dp += inc1;
300 sp += 2;
301 }
302
303 sp = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
304 dp = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
305 }
306 }
307 else if (channels == 3) {
308 inc0 = deltac[1];
309 inc1 = deltac[2] + inc0;
310 inc2 = deltac[3] + inc1;
311 for (j = 0; j < height; j++) {
312#pragma pipeloop(0)
313 for (i = 0; i < width; i++) {
314 s0 = sp[0];
315 s1 = sp[1];
316 s2 = sp[2];
317 dp[0] = s0;
318 dp[inc0] = s1;
319 dp[inc1] = s2;
320 dp += inc2;
321 sp += 3;
322 }
323
324 sp = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
325 dp = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
326 }
327 }
328}
329
330/***************************************************************/
331/* general channel insertion: slower due to the inner loop */
332
333void mlib_v_ImageChannelInsert_S32(const mlib_s32 *src,
334 mlib_s32 slb,
335 mlib_s32 *dst,
336 mlib_s32 dlb,
337 mlib_s32 channels,
338 mlib_s32 channeld,
339 mlib_s32 width,
340 mlib_s32 height,
341 mlib_s32 cmask)
342{
343 mlib_s32 *sp; /* pointer for pixel in src */
344 mlib_s32 *sl; /* pointer for line in src */
345 mlib_s32 *dp; /* pointer for pixel in dst */
346 mlib_s32 *dl; /* pointer for line in dst */
347 mlib_s32 i, j, k; /* indices for x, y, channel */
348 mlib_s32 deltac[5] = { 0, 1, 1, 1, 1 };
349 mlib_s32 inc0, inc1, inc2;
350 mlib_s32 s0, s1, s2;
351
352 deltac[channels] = 1;
353 for (i = (channeld - 1), k = 0; i >= 0; i--) {
354 if ((cmask & (1 << i)) == 0)
355 deltac[k]++;
356 else
357 k++;
358 }
359
360 deltac[channels] = channeld;
361 for (i = 1; i < channels; i++) {
362 deltac[channels] -= deltac[i];
363 }
364
365 sp = sl = (void *)src;
366 dp = dl = dst + deltac[0];
367
368 if (channels == 1) {
369 for (j = 0; j < height; j++) {
370#pragma pipeloop(0)
371 for (i = 0; i < width; i++) {
372 s0 = sp[0];
373 dp[0] = s0;
374 dp += channeld;
375 sp++;
376 }
377
378 sp = sl = (mlib_s32 *) ((mlib_u8 *) sl + slb);
379 dp = dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
380 }
381 }
382 else if (channels == 2) {
383 inc0 = deltac[1];
384 inc1 = deltac[2] + inc0;
385 for (j = 0; j < height; j++) {
386#pragma pipeloop(0)
387 for (i = 0; i < width; i++) {
388 s0 = sp[0];
389 s1 = sp[1];
390 dp[0] = s0;
391 dp[inc0] = s1;
392 dp += inc1;
393 sp += 2;
394 }
395
396 sp = sl = (mlib_s32 *) ((mlib_u8 *) sl + slb);
397 dp = dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
398 }
399 }
400 else if (channels == 3) {
401 inc0 = deltac[1];
402 inc1 = deltac[2] + inc0;
403 inc2 = deltac[3] + inc1;
404 for (j = 0; j < height; j++) {
405#pragma pipeloop(0)
406 for (i = 0; i < width; i++) {
407 s0 = sp[0];
408 s1 = sp[1];
409 s2 = sp[2];
410 dp[0] = s0;
411 dp[inc0] = s1;
412 dp[inc1] = s2;
413 dp += inc2;
414 sp += 3;
415 }
416
417 sp = sl = (mlib_s32 *) ((mlib_u8 *) sl + slb);
418 dp = dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
419 }
420 }
421}
422
423/***************************************************************/
/*
 * INSERT_U8_12(sd0, dd0, dd1) -- "channel duplicate".
 *
 * Builds two 8-byte results from one 8-byte source word: dd0 from the high
 * half of sd0 merged with itself, dd1 from the low half merged with itself.
 * (Per the VIS manual vis_fpmerge interleaves the bytes of its operands, so
 * every source byte should appear twice in a row -- confirm against the VIS
 * documentation.)
 *
 * Wrapped in do { } while (0) so that the macro is a single statement and
 * stays correct under an unbraced if/else or loop.  Invoke it as a
 * statement followed by a semicolon.
 */
#define INSERT_U8_12(sd0, dd0, dd1)       /* channel duplicate */       \
  do {                                                                  \
    (dd0) = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0));            \
    (dd1) = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0));            \
  } while (0)
427
428/***************************************************************/
429/* insert one channel to a 2-channel image.
430 * both source and destination image data are 8-byte aligned.
431 * dsize is multiple of 8.
432 */
433
434void mlib_v_ImageChannelInsert_U8_12_A8D1X8(const mlib_u8 *src,
435 mlib_u8 *dst,
436 mlib_s32 dsize,
437 mlib_s32 cmask)
438{
439 mlib_d64 *sp, *dp;
440 mlib_d64 sd0;
441 mlib_d64 dd0, dd1;
442 mlib_s32 bmask;
443 mlib_s32 i;
444
445 bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6);
446
447 sp = (mlib_d64 *) src;
448 dp = (mlib_d64 *) dst;
449
450#pragma pipeloop(0)
451 for (i = 0; i < dsize / 8; i++) {
452 sd0 = *sp++;
453 INSERT_U8_12(sd0, dd0, dd1);
454 vis_pst_8(dd0, dp++, bmask);
455 vis_pst_8(dd1, dp++, bmask);
456 }
457}
458
459/***************************************************************/
460/* insert one channel to a 2-channel image.
461 * both source and destination image data are 8-byte aligned.
462 * xsize is multiple of 8.
463 */
464
465void mlib_v_ImageChannelInsert_U8_12_A8D2X8(const mlib_u8 *src,
466 mlib_s32 slb,
467 mlib_u8 *dst,
468 mlib_s32 dlb,
469 mlib_s32 xsize,
470 mlib_s32 ysize,
471 mlib_s32 cmask)
472{
473 mlib_d64 *sp, *dp;
474 mlib_d64 *sl, *dl;
475 mlib_d64 sd0;
476 mlib_d64 dd0, dd1;
477 mlib_s32 bmask;
478 mlib_s32 i, j;
479
480 bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6);
481
482 sp = sl = (mlib_d64 *) src;
483 dp = dl = (mlib_d64 *) dst;
484
485 for (j = 0; j < ysize; j++) {
486#pragma pipeloop(0)
487 for (i = 0; i < xsize / 8; i++) {
488 sd0 = *sp++;
489 INSERT_U8_12(sd0, dd0, dd1);
490 vis_pst_8(dd0, dp++, bmask);
491 vis_pst_8(dd1, dp++, bmask);
492 }
493
494 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
495 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
496 }
497}
498
499/***************************************************************/
500/* insert one channel to a 2-channel image.
501 */
502
503void mlib_v_ImageChannelInsert_U8_12_D1(const mlib_u8 *src,
504 mlib_u8 *dst,
505 mlib_s32 dsize,
506 mlib_s32 cmask)
507{
508 mlib_u8 *sa, *da;
509 mlib_u8 *dend, *dend2; /* end points in dst */
510 mlib_d64 *dp; /* 8-byte aligned start points in dst */
511 mlib_d64 *sp; /* 8-byte aligned start point in src */
512 mlib_d64 sd0, sd1; /* 8-byte source data */
513 mlib_d64 dd0, dd1, dd2, dd3; /* 8-byte destination data */
514 mlib_s32 soff; /* offset of address in src */
515 mlib_s32 doff; /* offset of address in dst */
516 mlib_s32 off; /* offset of src over dst */
517 mlib_s32 emask; /* edge mask */
518 mlib_s32 bmask; /* channel mask */
519 mlib_s32 i, n;
520
521 bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6);
522
523 sa = (void *)src;
524 da = dst;
525
526 /* prepare the source address */
527 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
528 soff = ((mlib_addr) sa & 7);
529
530 /* prepare the destination addresses */
531 dp = (mlib_d64 *) ((mlib_addr) da & (~7));
532 doff = ((mlib_addr) da & 7);
533 dend = da + dsize * 2 - 1;
534 dend2 = dend - 15;
535
536 /* calculate the src's offset over dst */
537 off = soff * 2 - doff;
538
539 if (doff % 2 != 0) {
540 bmask = (~bmask) & 0xff;
541 }
542
543 if (off == 0) { /* src and dst have same alignment */
544
545 /* load 8 bytes */
546 sd0 = *sp++;
547
548 /* insert, including some garbage at the start point */
549 INSERT_U8_12(sd0, dd0, dd1);
550
551 /* store 16 bytes result */
552 emask = vis_edge8(da, dend);
553 vis_pst_8(dd0, dp++, emask & bmask);
554 if ((mlib_addr) dp <= (mlib_addr) dend) {
555 emask = vis_edge8(dp, dend);
556 vis_pst_8(dd1, dp++, emask & bmask);
557 }
558
559 if ((mlib_addr) dp <= (mlib_addr) dend2) {
560 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;
561
562 /* 8-pixel column loop, emask not needed */
563#pragma pipeloop(0)
564 for (i = 0; i < n; i++) {
565 sd0 = *sp++;
566 INSERT_U8_12(sd0, dd0, dd1);
567 vis_pst_8(dd0, dp++, bmask);
568 vis_pst_8(dd1, dp++, bmask);
569 }
570 }
571
572 /* end point handling */
573 if ((mlib_addr) dp <= (mlib_addr) dend) {
574 sd0 = *sp++;
575 INSERT_U8_12(sd0, dd0, dd1);
576 emask = vis_edge8(dp, dend);
577 vis_pst_8(dd0, dp++, emask & bmask);
578 if ((mlib_addr) dp <= (mlib_addr) dend) {
579 emask = vis_edge8(dp, dend);
580 vis_pst_8(dd1, dp++, emask & bmask);
581 }
582 }
583 }
584 else if (off < 0) {
585 vis_alignaddr((void *)0, off);
586
587 /* generate edge mask for the start point */
588 emask = vis_edge8(da, dend);
589
590 /* load 8 bytes */
591 sd0 = *sp++;
592
593 /* insert and store 16 bytes */
594 INSERT_U8_12(sd0, dd0, dd1);
595 vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
596 if ((mlib_addr) dp <= (mlib_addr) dend) {
597 emask = vis_edge8(dp, dend);
598 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
599 }
600
601 if ((mlib_addr) dp <= (mlib_addr) dend2) {
602 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;
603
604 /* 8-pixel column loop, emask not needed */
605#pragma pipeloop(0)
606 for (i = 0; i < n; i++) {
607 dd2 = dd1;
608 sd0 = *sp++;
609 INSERT_U8_12(sd0, dd0, dd1);
610 vis_pst_8(vis_faligndata(dd2, dd0), dp++, bmask);
611 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
612 }
613 }
614
615 /* end point handling */
616 if ((mlib_addr) dp <= (mlib_addr) dend) {
617 emask = vis_edge8(dp, dend);
618 dd2 = dd1;
619 sd0 = *sp++;
620 INSERT_U8_12(sd0, dd0, dd1);
621 vis_pst_8(vis_faligndata(dd2, dd0), dp++, emask & bmask);
622 if ((mlib_addr) dp <= (mlib_addr) dend) {
623 emask = vis_edge8(dp, dend);
624 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
625 }
626 }
627 }
628 else if (off < 8) {
629 vis_alignaddr((void *)0, off);
630
631 /* generate edge mask for the start point */
632 emask = vis_edge8(da, dend);
633
634 /* load 16 bytes */
635 sd0 = *sp++;
636 sd1 = *sp++;
637
638 /* insert and store 16 bytes */
639 INSERT_U8_12(sd0, dd0, dd1);
640 INSERT_U8_12(sd1, dd2, dd3);
641 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
642 if ((mlib_addr) dp <= (mlib_addr) dend) {
643 emask = vis_edge8(dp, dend);
644 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
645 }
646
647 if ((mlib_addr) dp <= (mlib_addr) dend2) {
648 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;
649
650 /* 8-pixel column loop, emask not needed */
651#pragma pipeloop(0)
652 for (i = 0; i < n; i++) {
653 dd0 = dd2;
654 dd1 = dd3;
655 sd1 = *sp++;
656 INSERT_U8_12(sd1, dd2, dd3);
657 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
658 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
659 }
660 }
661
662 /* end point handling */
663 if ((mlib_addr) dp <= (mlib_addr) dend) {
664 emask = vis_edge8(dp, dend);
665 dd0 = dd2;
666 dd1 = dd3;
667 sd1 = *sp++;
668 INSERT_U8_12(sd1, dd2, dd3);
669 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
670 if ((mlib_addr) dp <= (mlib_addr) dend) {
671 emask = vis_edge8(dp, dend);
672 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
673 }
674 }
675 }
676 else { /* (off >= 8) */
677 vis_alignaddr((void *)0, off);
678
679 /* generate edge mask for the start point */
680 emask = vis_edge8(da, dend);
681
682 /* load 16 bytes */
683 sd0 = *sp++;
684 sd1 = *sp++;
685
686 /* insert and store 16 bytes */
687 INSERT_U8_12(sd0, dd0, dd1);
688 INSERT_U8_12(sd1, dd2, dd3);
689 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
690 if ((mlib_addr) dp <= (mlib_addr) dend) {
691 emask = vis_edge8(dp, dend);
692 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
693 }
694
695 if ((mlib_addr) dp <= (mlib_addr) dend2) {
696 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;
697
698 /* 8-pixel column loop, emask not needed */
699#pragma pipeloop(0)
700 for (i = 0; i < n; i++) {
701 dd0 = dd2;
702 dd1 = dd3;
703 sd1 = *sp++;
704 INSERT_U8_12(sd1, dd2, dd3);
705 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
706 vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
707 }
708 }
709
710 /* end point handling */
711 if ((mlib_addr) dp <= (mlib_addr) dend) {
712 emask = vis_edge8(dp, dend);
713 dd0 = dd2;
714 dd1 = dd3;
715 sd1 = *sp++;
716 INSERT_U8_12(sd1, dd2, dd3);
717 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
718 if ((mlib_addr) dp <= (mlib_addr) dend) {
719 emask = vis_edge8(dp, dend);
720 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
721 }
722 }
723 }
724}
725
726/***************************************************************/
727/* insert one channel to a 2-channel image.
728 */
729
730void mlib_v_ImageChannelInsert_U8_12(const mlib_u8 *src,
731 mlib_s32 slb,
732 mlib_u8 *dst,
733 mlib_s32 dlb,
734 mlib_s32 xsize,
735 mlib_s32 ysize,
736 mlib_s32 cmask)
737{
738 mlib_u8 *sa, *da;
739 mlib_u8 *sl, *dl;
740 mlib_s32 j;
741
742 sa = sl = (void *)src;
743 da = dl = dst;
744
745#pragma pipeloop(0)
746 for (j = 0; j < ysize; j++) {
747 mlib_v_ImageChannelInsert_U8_12_D1(sa, da, xsize, cmask);
748 sa = sl += slb;
749 da = dl += dlb;
750 }
751}
752
753/***************************************************************/
/*
 * INSERT_U8_13(sd0, dd0, dd1, dd2)
 *
 * Expands one 8-byte source word sd0 into three 8-byte results through a
 * chain of vis_fpmerge / vis_freg_pair steps.  The intermediate values are
 * kept in sda, sdb, sdc, sdd and sde, which the invoking function must
 * declare itself.
 *
 * Wrapped in do { } while (0) so that the macro is a single statement and
 * stays correct under an unbraced if/else or loop.  Invoke it as a
 * statement followed by a semicolon.
 */
#define INSERT_U8_13(sd0, dd0, dd1, dd2)                                \
  do {                                                                  \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd0));              \
    sdb = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sda));              \
    sdc = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb));              \
    sdd = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb));              \
    (dd0) = vis_fpmerge(vis_read_hi(sdc), vis_read_hi(sdd));            \
    sde = vis_fpmerge(vis_read_lo(sdc), vis_read_lo(sdd));              \
    (dd1) = vis_freg_pair(vis_read_lo(dd0), vis_read_hi(sde));          \
    (dd2) = vis_freg_pair(vis_read_lo(sde), vis_read_lo(sde));          \
  } while (0)
763
764/***************************************************************/
/*
 * LOAD_INSERT_STORE_U8_A8(channeld)
 *
 * Loads one 8-byte word from the aligned source cursor `sp` into `sd`,
 * then performs eight times: sd = vis_faligndata(sd, sd), vis_st_u8 of sd
 * to `da`, and da += channeld.  `sp`, `sd` and `da` are variables of the
 * invoking function.  The functions in this file that use the macro call
 * vis_alignaddr((void *)0, 1) beforehand ("for 1-byte left shift").
 *
 * Wrapped in do { } while (0) so that the macro is a single statement and
 * stays correct under an unbraced if/else or loop; the argument is
 * parenthesized so an expression can be passed safely.  Invoke it as a
 * statement followed by a semicolon.
 */
#define LOAD_INSERT_STORE_U8_A8(channeld)                               \
  do {                                                                  \
    sd = *sp++;                                                         \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
  } while (0)
775
776/***************************************************************/
/*
 * LOAD_INSERT_STORE_U8(channeld)
 *
 * Variant of LOAD_INSERT_STORE_U8_A8 for a source that is not 8-byte
 * aligned.  It first calls vis_alignaddr with the source offset `off`,
 * moves sd1 to sd0, loads the next aligned word into sd1 and combines
 * both with vis_faligndata into `sd`.  It then calls vis_alignaddr with 1
 * and performs eight times: sd = vis_faligndata(sd, sd), vis_st_u8 of sd
 * to `da`, and da += channeld.  `off`, `sp`, `sd0`, `sd1`, `sd` and `da`
 * are variables of the invoking function.
 *
 * Wrapped in do { } while (0) so that the macro is a single statement and
 * stays correct under an unbraced if/else or loop; the argument is
 * parenthesized so an expression can be passed safely.  Invoke it as a
 * statement followed by a semicolon.
 */
#define LOAD_INSERT_STORE_U8(channeld)                                  \
  do {                                                                  \
    vis_alignaddr((void *)0, off);                                      \
    sd0 = sd1;                                                          \
    sd1 = *sp++;                                                        \
    sd  = vis_faligndata(sd0, sd1);                                     \
    vis_alignaddr((void *)0, 1);                                        \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
  } while (0)
791
792/***************************************************************/
793void mlib_v_ImageChannelInsert_U8_13_A8D1X8(const mlib_u8 *src,
794 mlib_u8 *dst,
795 mlib_s32 dsize,
796 mlib_s32 cmask)
797{
798 mlib_u8 *da;
799 mlib_d64 *sp;
800 mlib_d64 sd;
801 mlib_s32 i;
802
803 vis_alignaddr((void *)0, 1); /* for 1-byte left shift */
804
805 sp = (mlib_d64 *) src;
806 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
807
808#pragma pipeloop(0)
809 for (i = 0; i < dsize / 8; i++) {
810 LOAD_INSERT_STORE_U8_A8(3);
811 }
812}
813
814/***************************************************************/
815void mlib_v_ImageChannelInsert_U8_13_A8D2X8(const mlib_u8 *src,
816 mlib_s32 slb,
817 mlib_u8 *dst,
818 mlib_s32 dlb,
819 mlib_s32 xsize,
820 mlib_s32 ysize,
821 mlib_s32 cmask)
822{
823 mlib_u8 *da, *dl;
824 mlib_d64 *sp, *sl;
825 mlib_d64 sd;
826 mlib_s32 i, j;
827
828 vis_alignaddr((void *)0, 1);
829
830 sp = sl = (mlib_d64 *) src;
831 da = dl = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
832
833 for (j = 0; j < ysize; j++) {
834#pragma pipeloop(0)
835 for (i = 0; i < xsize / 8; i++) {
836 LOAD_INSERT_STORE_U8_A8(3);
837 }
838
839 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
840 da = dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
841 }
842}
843
844/***************************************************************/
845void mlib_v_ImageChannelInsert_U8_13_D1(const mlib_u8 *src,
846 mlib_u8 *dst,
847 mlib_s32 dsize,
848 mlib_s32 cmask)
849{
850 mlib_u8 *sa, *da;
851 mlib_u8 *dend; /* end point in destination */
852 mlib_d64 *sp; /* 8-byte aligned start points in src */
853 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */
854 mlib_s32 off; /* offset of address alignment in src */
855 mlib_s32 i;
856
857 /* prepare the src address */
858 sa = (void *)src;
859 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
860 off = (mlib_addr) sa & 7;
861
862 /* prepare the dst address */
863 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
864 dend = da + dsize * 3 - 1;
865
866 sd1 = *sp++;
867
868#pragma pipeloop(0)
869 for (i = 0; i < dsize / 8; i++) {
870 LOAD_INSERT_STORE_U8(3);
871 }
872
873 /* right end handling */
874 if ((mlib_addr) da <= (mlib_addr) dend) {
875
876 vis_alignaddr((void *)0, off);
877 sd0 = sd1;
878 sd1 = *sp++;
879 sd = vis_faligndata(sd0, sd1);
880
881 vis_alignaddr((void *)0, 1);
882 vis_st_u8(sd = vis_faligndata(sd, sd), da);
883 da += 3;
884 if ((mlib_addr) da <= (mlib_addr) dend) {
885 vis_st_u8(sd = vis_faligndata(sd, sd), da);
886 da += 3;
887 if ((mlib_addr) da <= (mlib_addr) dend) {
888 vis_st_u8(sd = vis_faligndata(sd, sd), da);
889 da += 3;
890 if ((mlib_addr) da <= (mlib_addr) dend) {
891 vis_st_u8(sd = vis_faligndata(sd, sd), da);
892 da += 3;
893 if ((mlib_addr) da <= (mlib_addr) dend) {
894 vis_st_u8(sd = vis_faligndata(sd, sd), da);
895 da += 3;
896 if ((mlib_addr) da <= (mlib_addr) dend) {
897 vis_st_u8(sd = vis_faligndata(sd, sd), da);
898 da += 3;
899 if ((mlib_addr) da <= (mlib_addr) dend) {
900 vis_st_u8(sd = vis_faligndata(sd, sd), da);
901 }
902 }
903 }
904 }
905 }
906 }
907 }
908}
909
910/***************************************************************/
911void mlib_v_ImageChannelInsert_U8_13(const mlib_u8 *src,
912 mlib_s32 slb,
913 mlib_u8 *dst,
914 mlib_s32 dlb,
915 mlib_s32 xsize,
916 mlib_s32 ysize,
917 mlib_s32 cmask)
918{
919 mlib_u8 *sa, *da;
920 mlib_u8 *sl, *dl;
921 mlib_s32 j;
922
923 sa = sl = (void *)src;
924 da = dl = dst;
925
926#pragma pipeloop(0)
927 for (j = 0; j < ysize; j++) {
928 mlib_v_ImageChannelInsert_U8_13_D1(sa, da, xsize, cmask);
929 sa = sl += slb;
930 da = dl += dlb;
931 }
932}
933
934/***************************************************************/
/*
 * INSERT_U8_14(sd0, dd0, dd1, dd2, dd3)
 *
 * Expands one 8-byte source word sd0 into four 8-byte results by two
 * rounds of merging each half with itself.  (Per the VIS manual
 * vis_fpmerge interleaves the bytes of its operands, so every source byte
 * should appear four times in a row -- confirm against the VIS
 * documentation.)  The intermediates are kept in sda and sdb, which the
 * invoking function must declare itself.
 *
 * Wrapped in do { } while (0) so that the macro is a single statement and
 * stays correct under an unbraced if/else or loop.  Invoke it as a
 * statement followed by a semicolon.
 */
#define INSERT_U8_14(sd0, dd0, dd1, dd2, dd3)                           \
  do {                                                                  \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0));              \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0));              \
    (dd0) = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sda));            \
    (dd1) = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sda));            \
    (dd2) = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb));            \
    (dd3) = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb));            \
  } while (0)
942
943/***************************************************************/
944void mlib_v_ImageChannelInsert_U8_14_A8D1X8(const mlib_u8 *src,
945 mlib_u8 *dst,
946 mlib_s32 dsize,
947 mlib_s32 cmask)
948{
949 mlib_d64 *sp, *dp;
950 mlib_d64 sd0;
951 mlib_d64 sda, sdb;
952 mlib_d64 dd0, dd1, dd2, dd3;
953 mlib_s32 bmask;
954 mlib_s32 i;
955
956 bmask = cmask | (cmask << 4);
957
958 sp = (mlib_d64 *) src;
959 dp = (mlib_d64 *) dst;
960
961#pragma pipeloop(0)
962 for (i = 0; i < dsize / 8; i++) {
963 sd0 = *sp++;
964 INSERT_U8_14(sd0, dd0, dd1, dd2, dd3);
965 vis_pst_8(dd0, dp++, bmask);
966 vis_pst_8(dd1, dp++, bmask);
967 vis_pst_8(dd2, dp++, bmask);
968 vis_pst_8(dd3, dp++, bmask);
969 }
970}
971
972/***************************************************************/
973void mlib_v_ImageChannelInsert_U8_14_A8D2X8(const mlib_u8 *src,
974 mlib_s32 slb,
975 mlib_u8 *dst,
976 mlib_s32 dlb,
977 mlib_s32 xsize,
978 mlib_s32 ysize,
979 mlib_s32 cmask)
980{
981 mlib_d64 *sp, *dp;
982 mlib_d64 *sl, *dl;
983 mlib_d64 sd0;
984 mlib_d64 sda, sdb;
985 mlib_d64 dd0, dd1, dd2, dd3;
986 mlib_s32 bmask;
987 mlib_s32 i, j;
988
989 bmask = cmask | (cmask << 4);
990
991 sp = sl = (mlib_d64 *) src;
992 dp = dl = (mlib_d64 *) dst;
993
994 for (j = 0; j < ysize; j++) {
995#pragma pipeloop(0)
996 for (i = 0; i < xsize / 8; i++) {
997 sd0 = *sp++;
998 INSERT_U8_14(sd0, dd0, dd1, dd2, dd3);
999 vis_pst_8(dd0, dp++, bmask);
1000 vis_pst_8(dd1, dp++, bmask);
1001 vis_pst_8(dd2, dp++, bmask);
1002 vis_pst_8(dd3, dp++, bmask);
1003 }
1004
1005 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1006 dp = dl = (mlib_d64 *) ((mlib_u8 *) dl + dlb);
1007 }
1008}
1009
1010/***************************************************************/
1011void mlib_v_ImageChannelInsert_U8_14_D1(const mlib_u8 *src,
1012 mlib_u8 *dst,
1013 mlib_s32 dsize,
1014 mlib_s32 cmask)
1015{
1016 mlib_u8 *sa, *da;
1017 mlib_u8 *dend, *dend2; /* end points in dst */
1018 mlib_d64 *dp; /* 8-byte aligned start points in dst */
1019 mlib_d64 *sp; /* 8-byte aligned start point in src */
1020 mlib_d64 sd0, sd1, sd; /* 8-byte source data */
1021 mlib_d64 sda, sdb;
1022 mlib_d64 dd0, dd1, dd2, dd3, dd4;
1023 mlib_s32 soff; /* offset of address in src */
1024 mlib_s32 doff; /* offset of address in dst */
1025 mlib_s32 emask; /* edge mask */
1026 mlib_s32 bmask; /* channel mask */
1027 mlib_s32 i, n;
1028
1029 sa = (void *)src;
1030 da = dst;
1031
1032 bmask = cmask | (cmask << 4) | (cmask << 8);
1033
1034 /* prepare the source address */
1035 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1036 soff = ((mlib_addr) sa & 7);
1037
1038 /* prepare the destination addresses */
1039 dp = (mlib_d64 *) ((mlib_addr) da & (~7));
1040 doff = ((mlib_addr) da & 7);
1041 dend = da + dsize * 4 - 1;
1042 dend2 = dend - 31;
1043
1044 bmask = (bmask >> (doff % 4)) & 0xff;
1045
1046 if (doff == 0) { /* dst is 8-byte aligned */
1047
1048 vis_alignaddr((void *)0, soff);
1049 sd0 = *sp++;
1050 sd1 = *sp++;
1051 sd = vis_faligndata(sd0, sd1); /* the intermediate is aligned */
1052
1053 INSERT_U8_14(sd, dd0, dd1, dd2, dd3);
1054
1055 emask = vis_edge8(da, dend);
1056 vis_pst_8(dd0, dp++, emask & bmask);
1057 if ((mlib_addr) dp <= (mlib_addr) dend) { /* for very small size */
1058 emask = vis_edge8(dp, dend);
1059 vis_pst_8(dd1, dp++, emask & bmask);
1060 if ((mlib_addr) dp <= (mlib_addr) dend) {
1061 emask = vis_edge8(dp, dend);
1062 vis_pst_8(dd2, dp++, emask & bmask);
1063 if ((mlib_addr) dp <= (mlib_addr) dend) {
1064 emask = vis_edge8(dp, dend);
1065 vis_pst_8(dd3, dp++, emask & bmask);
1066 }
1067 }
1068 }
1069
1070 if ((mlib_addr) dp <= (mlib_addr) dend2) {
1071 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 32 + 1;
1072
1073 /* 8-pixel column loop, emask not needed */
1074#pragma pipeloop(0)
1075 for (i = 0; i < n; i++) {
1076 sd0 = sd1;
1077 sd1 = *sp++;
1078 sd = vis_faligndata(sd0, sd1);
1079
1080 INSERT_U8_14(sd, dd0, dd1, dd2, dd3);
1081
1082 vis_pst_8(dd0, dp++, bmask);
1083 vis_pst_8(dd1, dp++, bmask);
1084 vis_pst_8(dd2, dp++, bmask);
1085 vis_pst_8(dd3, dp++, bmask);
1086 }
1087 }
1088
1089 /* end point handling */
1090 if ((mlib_addr) dp <= (mlib_addr) dend) {
1091 sd0 = sd1;
1092 sd1 = *sp++;
1093 sd = vis_faligndata(sd0, sd1);
1094
1095 INSERT_U8_14(sd, dd0, dd1, dd2, dd3);
1096
1097 emask = vis_edge8(dp, dend);
1098 vis_pst_8(dd0, dp++, emask & bmask);
1099 if ((mlib_addr) dp <= (mlib_addr) dend) {
1100 emask = vis_edge8(dp, dend);
1101 vis_pst_8(dd1, dp++, emask & bmask);
1102 if ((mlib_addr) dp <= (mlib_addr) dend) {
1103 emask = vis_edge8(dp, dend);
1104 vis_pst_8(dd2, dp++, emask & bmask);
1105 if ((mlib_addr) dp <= (mlib_addr) dend) {
1106 emask = vis_edge8(dp, dend);
1107 vis_pst_8(dd3, dp++, emask & bmask);
1108 }
1109 }
1110 }
1111 }
1112 }
1113 else { /* dst is not 8-byte aligned */
1114 vis_alignaddr((void *)0, soff);
1115 sd0 = *sp++;
1116 sd1 = *sp++;
1117 sd = vis_faligndata(sd0, sd1); /* the intermediate is aligned */
1118
1119 INSERT_U8_14(sd, dd0, dd1, dd2, dd3);
1120
1121 vis_alignaddr((void *)0, -doff);
1122
1123 emask = vis_edge8(da, dend);
1124 vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
1125 if ((mlib_addr) dp <= (mlib_addr) dend) { /* for very small size */
1126 emask = vis_edge8(dp, dend);
1127 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
1128 if ((mlib_addr) dp <= (mlib_addr) dend) {
1129 emask = vis_edge8(dp, dend);
1130 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
1131 if ((mlib_addr) dp <= (mlib_addr) dend) {
1132 emask = vis_edge8(dp, dend);
1133 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
1134 }
1135 }
1136 }
1137
1138 if ((mlib_addr) dp <= (mlib_addr) dend2) {
1139 n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 32 + 1;
1140
1141 /* 8-pixel column loop, emask not needed */
1142#pragma pipeloop(0)
1143 for (i = 0; i < n; i++) {
1144 dd4 = dd3;
1145
1146 vis_alignaddr((void *)0, soff);
1147 sd0 = sd1;
1148 sd1 = *sp++;
1149 sd = vis_faligndata(sd0, sd1);
1150
1151 INSERT_U8_14(sd, dd0, dd1, dd2, dd3);
1152
1153 vis_alignaddr((void *)0, -doff);
1154 vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask);
1155 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
1156 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
1157 vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
1158 }
1159 }
1160
1161 /* end point handling */
1162 if ((mlib_addr) dp <= (mlib_addr) dend) {
1163 dd4 = dd3;
1164
1165 vis_alignaddr((void *)0, soff);
1166 sd0 = sd1;
1167 sd1 = *sp++;
1168 sd = vis_faligndata(sd0, sd1);
1169
1170 INSERT_U8_14(sd, dd0, dd1, dd2, dd3);
1171
1172 vis_alignaddr((void *)0, -doff);
1173 emask = vis_edge8(dp, dend);
1174 vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask);
1175 if ((mlib_addr) dp <= (mlib_addr) dend) {
1176 emask = vis_edge8(dp, dend);
1177 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
1178 if ((mlib_addr) dp <= (mlib_addr) dend) {
1179 emask = vis_edge8(dp, dend);
1180 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
1181 if ((mlib_addr) dp <= (mlib_addr) dend) {
1182 emask = vis_edge8(dp, dend);
1183 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
1184 }
1185 }
1186 }
1187 }
1188 }
1189}
1190
1191/***************************************************************/
1192void mlib_v_ImageChannelInsert_U8_14(const mlib_u8 *src,
1193 mlib_s32 slb,
1194 mlib_u8 *dst,
1195 mlib_s32 dlb,
1196 mlib_s32 xsize,
1197 mlib_s32 ysize,
1198 mlib_s32 cmask)
1199{
1200 mlib_u8 *sa, *da;
1201 mlib_u8 *sl, *dl;
1202 mlib_s32 j;
1203
1204 sa = sl = (void *)src;
1205 da = dl = dst;
1206
1207#pragma pipeloop(0)
1208 for (j = 0; j < ysize; j++) {
1209 mlib_v_ImageChannelInsert_U8_14_D1(sa, da, xsize, cmask);
1210 sa = sl += slb;
1211 da = dl += dlb;
1212 }
1213}
1214
1215/***************************************************************/
/*
 * Load one source doubleword (four 16-bit samples) through sp and store
 * the samples, first to last, at da, da + channeld, da + 2 * channeld and
 * da + 3 * channeld.  On exit sp has advanced by one doubleword and da by
 * 4 * channeld elements.
 *
 * Uses the caller's locals sd (mlib_d64), sp (mlib_d64 pointer) and
 * da (mlib_s16 pointer).  The caller must already have executed
 * vis_alignaddr((void *)0, 2): with that alignment offset each
 * vis_faligndata(sd, sd) rotates sd by one 16-bit sample, bringing the
 * next sample into the low halfword that vis_st_u16() stores (VIS
 * semantics of faligndata and the 16-bit short store).
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays correct under an unbraced if/else; channeld is
 * parenthesized so an expression argument keeps its meaning.
 */
#define LOAD_INSERT_STORE_S16_1X_A8(channeld)                           \
  do {                                                                  \
    sd = *sp++;                                                         \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
  } while (0)
1222
1223/***************************************************************/
/*
 * General-alignment variant of LOAD_INSERT_STORE_S16_1X_A8.
 *
 * Fetches the next aligned source doubleword through sp, combines it with
 * the previously fetched one (sd1) using the source misalignment `off` to
 * obtain four consecutive 16-bit samples in sd, then stores them, first to
 * last, at da, da + channeld, da + 2 * channeld and da + 3 * channeld.
 * On exit sp has advanced by one doubleword, sd1 holds the doubleword just
 * fetched, and da has advanced by 4 * channeld elements.
 *
 * Uses the caller's locals off, sd0, sd1, sd, sp and da.  sd1 must hold
 * the doubleword preceding *sp on entry.  The alignment offset is set to
 * `off` for the extraction and then to 2 so that each
 * vis_faligndata(sd, sd) rotates sd by one 16-bit sample (VIS semantics).
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays correct under an unbraced if/else; channeld is
 * parenthesized so an expression argument keeps its meaning.
 */
#define LOAD_INSERT_STORE_S16_1X(channeld)                              \
  do {                                                                  \
    vis_alignaddr((void *)0, off);                                      \
    sd0 = sd1;                                                          \
    sd1 = *sp++;                                                        \
    sd  = vis_faligndata(sd0, sd1);                                     \
    vis_alignaddr((void *)0, 2);                                        \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
    vis_st_u16(sd = vis_faligndata(sd, sd), da); da += (channeld);      \
  } while (0)
1234
1235/***************************************************************/
1236void mlib_v_ImageChannelInsert_S16_12_A8D1X4(const mlib_s16 *src,
1237 mlib_s16 *dst,
1238 mlib_s32 dsize,
1239 mlib_s32 cmask)
1240{
1241 mlib_s16 *da;
1242 mlib_d64 *sp;
1243 mlib_d64 sd;
1244 mlib_s32 i;
1245
1246 sp = (mlib_d64 *) src;
1247 da = dst + (2 - cmask); /* 2,1 -> 0,1 */
1248
1249 vis_alignaddr((void *)0, 2);
1250
1251#pragma pipeloop(0)
1252 for (i = 0; i < dsize / 4; i++) {
1253 LOAD_INSERT_STORE_S16_1X_A8(2);
1254 }
1255}
1256
1257/***************************************************************/
1258void mlib_v_ImageChannelInsert_S16_12_A8D2X4(const mlib_s16 *src,
1259 mlib_s32 slb,
1260 mlib_s16 *dst,
1261 mlib_s32 dlb,
1262 mlib_s32 xsize,
1263 mlib_s32 ysize,
1264 mlib_s32 cmask)
1265{
1266 mlib_s16 *da, *dl;
1267 mlib_d64 *sp, *sl;
1268 mlib_d64 sd;
1269 mlib_s32 i, j;
1270
1271 sp = sl = (mlib_d64 *) src;
1272 da = dl = dst + (2 - cmask); /* 2,1 -> 0,1 */
1273
1274 vis_alignaddr((void *)0, 2);
1275
1276 for (j = 0; j < ysize; j++) {
1277#pragma pipeloop(0)
1278 for (i = 0; i < xsize / 4; i++) {
1279 LOAD_INSERT_STORE_S16_1X_A8(2);
1280 }
1281
1282 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1283 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1284 }
1285}
1286
1287/***************************************************************/
1288void mlib_v_ImageChannelInsert_S16_12_D1(const mlib_s16 *src,
1289 mlib_s16 *dst,
1290 mlib_s32 dsize,
1291 mlib_s32 cmask)
1292{
1293 mlib_s16 *sa, *da;
1294 mlib_s16 *dend; /* end point in destination */
1295 mlib_d64 *sp; /* 8-byte aligned start points in src */
1296 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */
1297 mlib_s32 off; /* offset of address alignment in src */
1298 mlib_s32 i;
1299
1300 sa = (void *)src;
1301 da = dst + (2 - cmask); /* 2,1 -> 0,1 */
1302
1303 /* prepare the src address */
1304 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1305 off = (mlib_addr) sa & 7;
1306
1307 dend = da + dsize * 2 - 1;
1308
1309 sd1 = *sp++;
1310
1311#pragma pipeloop(0)
1312 for (i = 0; i < dsize / 4; i++) {
1313 LOAD_INSERT_STORE_S16_1X(2);
1314 }
1315
1316 /* right end handling */
1317 if ((mlib_addr) da <= (mlib_addr) dend) {
1318
1319 vis_alignaddr((void *)0, off);
1320 sd0 = sd1;
1321 sd1 = *sp++;
1322 sd = vis_faligndata(sd0, sd1);
1323
1324 vis_alignaddr((void *)0, 2);
1325 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1326 da += 2;
1327 if ((mlib_addr) da <= (mlib_addr) dend) {
1328 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1329 da += 2;
1330 if ((mlib_addr) da <= (mlib_addr) dend) {
1331 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1332 }
1333 }
1334 }
1335}
1336
1337/***************************************************************/
1338void mlib_v_ImageChannelInsert_S16_12(const mlib_s16 *src,
1339 mlib_s32 slb,
1340 mlib_s16 *dst,
1341 mlib_s32 dlb,
1342 mlib_s32 xsize,
1343 mlib_s32 ysize,
1344 mlib_s32 cmask)
1345{
1346 mlib_s16 *sa, *da;
1347 mlib_s16 *sl, *dl;
1348 mlib_s32 j;
1349
1350 sa = sl = (void *)src;
1351 da = dl = dst;
1352
1353#pragma pipeloop(0)
1354 for (j = 0; j < ysize; j++) {
1355 mlib_v_ImageChannelInsert_S16_12_D1(sa, da, xsize, cmask);
1356 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
1357 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1358 }
1359}
1360
1361/***************************************************************/
1362void mlib_v_ImageChannelInsert_S16_13_A8D1X4(const mlib_s16 *src,
1363 mlib_s16 *dst,
1364 mlib_s32 dsize,
1365 mlib_s32 cmask)
1366{
1367 mlib_s16 *da;
1368 mlib_d64 *sp;
1369 mlib_d64 sd;
1370 mlib_s32 i;
1371
1372 sp = (mlib_d64 *) src;
1373 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
1374
1375 vis_alignaddr((void *)0, 2);
1376
1377#pragma pipeloop(0)
1378 for (i = 0; i < dsize / 4; i++) {
1379 LOAD_INSERT_STORE_S16_1X_A8(3);
1380 }
1381}
1382
1383/***************************************************************/
1384void mlib_v_ImageChannelInsert_S16_13_A8D2X4(const mlib_s16 *src,
1385 mlib_s32 slb,
1386 mlib_s16 *dst,
1387 mlib_s32 dlb,
1388 mlib_s32 xsize,
1389 mlib_s32 ysize,
1390 mlib_s32 cmask)
1391{
1392 mlib_s16 *da, *dl;
1393 mlib_d64 *sp, *sl;
1394 mlib_d64 sd;
1395 mlib_s32 i, j;
1396
1397 sp = sl = (mlib_d64 *) src;
1398 da = dl = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
1399
1400 vis_alignaddr((void *)0, 2);
1401
1402 for (j = 0; j < ysize; j++) {
1403#pragma pipeloop(0)
1404 for (i = 0; i < xsize / 4; i++) {
1405 LOAD_INSERT_STORE_S16_1X_A8(3);
1406 }
1407
1408 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1409 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1410 }
1411}
1412
1413/***************************************************************/
1414void mlib_v_ImageChannelInsert_S16_13_D1(const mlib_s16 *src,
1415 mlib_s16 *dst,
1416 mlib_s32 dsize,
1417 mlib_s32 cmask)
1418{
1419 mlib_s16 *sa, *da;
1420 mlib_s16 *dend; /* end point in destination */
1421 mlib_d64 *sp; /* 8-byte aligned start points in src */
1422 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */
1423 mlib_s32 off; /* offset of address alignment in src */
1424 mlib_s32 i;
1425
1426 sa = (void *)src;
1427 da = dst + (2 / cmask); /* 4,2,1 -> 0,1,2 */
1428
1429 /* prepare the src address */
1430 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1431 off = (mlib_addr) sa & 7;
1432
1433 dend = da + dsize * 3 - 1;
1434
1435 sd1 = *sp++;
1436
1437#pragma pipeloop(0)
1438 for (i = 0; i < dsize / 4; i++) {
1439 LOAD_INSERT_STORE_S16_1X(3);
1440 }
1441
1442 /* right end handling */
1443 if ((mlib_addr) da <= (mlib_addr) dend) {
1444
1445 vis_alignaddr((void *)0, off);
1446 sd0 = sd1;
1447 sd1 = *sp++;
1448 sd = vis_faligndata(sd0, sd1);
1449
1450 vis_alignaddr((void *)0, 2);
1451 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1452 da += 3;
1453 if ((mlib_addr) da <= (mlib_addr) dend) {
1454 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1455 da += 3;
1456 if ((mlib_addr) da <= (mlib_addr) dend) {
1457 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1458 }
1459 }
1460 }
1461}
1462
1463/***************************************************************/
1464void mlib_v_ImageChannelInsert_S16_13(const mlib_s16 *src,
1465 mlib_s32 slb,
1466 mlib_s16 *dst,
1467 mlib_s32 dlb,
1468 mlib_s32 xsize,
1469 mlib_s32 ysize,
1470 mlib_s32 cmask)
1471{
1472 mlib_s16 *sa, *da;
1473 mlib_s16 *sl, *dl;
1474 mlib_s32 j;
1475
1476 sa = sl = (void *)src;
1477 da = dl = dst;
1478
1479#pragma pipeloop(0)
1480 for (j = 0; j < ysize; j++) {
1481 mlib_v_ImageChannelInsert_S16_13_D1(sa, da, xsize, cmask);
1482 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
1483 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1484 }
1485}
1486
1487/***************************************************************/
/*
 * Obsolete alternative for the 1-into-4 S16 insert, kept for reference:
 * it is slower than the vis_st_u16() version.
 *
 * Loads one doubleword (four 16-bit samples A B C D) through sp and uses
 * three rounds of vis_fpmerge() to build four doublewords that each hold
 * one sample replicated four times (dd0 = AAAA, dd1 = BBBB, dd2 = CCCC,
 * dd3 = DDDD).  Each is written with a partial store to four consecutive
 * doublewords through dp; bmask selects which 16-bit lanes (i.e. which
 * destination channel) are actually written.
 *
 * Uses the caller's locals sd0, sda..sdf and dd0..dd3; advances sp by one
 * and dp by four doublewords.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement, with its parameters parenthesized so expression arguments
 * keep their meaning.
 */
#define INSERT_S16_14(sp, dp, bmask)    /* channel duplicate */        \
  do {                                                                  \
    sd0 = *(sp)++;                                                      \
    /* double every byte: A1 A1 A0 A0 B1 B1 B0 B0 / same for C, D */    \
    sda = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0));              \
    sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0));              \
    /* double again: four copies of each byte of one sample */          \
    sdc = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sda));              \
    sdd = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sda));              \
    sde = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb));              \
    sdf = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb));              \
    /* interleave high and low bytes back into 16-bit samples */        \
    dd0 = vis_fpmerge(vis_read_hi(sdc), vis_read_lo(sdc));              \
    dd1 = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sdd));              \
    dd2 = vis_fpmerge(vis_read_hi(sde), vis_read_lo(sde));              \
    dd3 = vis_fpmerge(vis_read_hi(sdf), vis_read_lo(sdf));              \
    vis_pst_16(dd0, (dp)++, (bmask));                                   \
    vis_pst_16(dd1, (dp)++, (bmask));                                   \
    vis_pst_16(dd2, (dp)++, (bmask));                                   \
    vis_pst_16(dd3, (dp)++, (bmask));                                   \
  } while (0)
1505
1506/***************************************************************/
1507void mlib_v_ImageChannelInsert_S16_14_A8D1X4(const mlib_s16 *src,
1508 mlib_s16 *dst,
1509 mlib_s32 dsize,
1510 mlib_s32 cmask)
1511{
1512 mlib_s16 *da;
1513 mlib_d64 *sp;
1514 mlib_d64 sd;
1515 mlib_s32 i;
1516
1517 sp = (mlib_d64 *) src;
1518 da = dst + (6 / cmask + 1) / 2; /* 8,4,2,1 -> 0,1,2,3 */
1519
1520 vis_alignaddr((void *)0, 2);
1521
1522#pragma pipeloop(0)
1523 for (i = 0; i < dsize / 4; i++) {
1524 LOAD_INSERT_STORE_S16_1X_A8(4);
1525 }
1526}
1527
1528/***************************************************************/
1529void mlib_v_ImageChannelInsert_S16_14_A8D2X4(const mlib_s16 *src,
1530 mlib_s32 slb,
1531 mlib_s16 *dst,
1532 mlib_s32 dlb,
1533 mlib_s32 xsize,
1534 mlib_s32 ysize,
1535 mlib_s32 cmask)
1536{
1537 mlib_s16 *da, *dl;
1538 mlib_d64 *sp, *sl;
1539 mlib_d64 sd;
1540 mlib_s32 i, j;
1541
1542 sp = sl = (mlib_d64 *) src;
1543 da = dl = dst + (6 / cmask + 1) / 2; /* 8,4,2,1 -> 0,1,2,3 */
1544
1545 vis_alignaddr((void *)0, 2);
1546
1547 for (j = 0; j < ysize; j++) {
1548#pragma pipeloop(0)
1549 for (i = 0; i < xsize / 4; i++) {
1550 LOAD_INSERT_STORE_S16_1X_A8(4);
1551 }
1552
1553 sp = sl = (mlib_d64 *) ((mlib_u8 *) sl + slb);
1554 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1555 }
1556}
1557
1558/***************************************************************/
1559void mlib_v_ImageChannelInsert_S16_14_D1(const mlib_s16 *src,
1560 mlib_s16 *dst,
1561 mlib_s32 dsize,
1562 mlib_s32 cmask)
1563{
1564 mlib_s16 *sa, *da;
1565 mlib_s16 *dend; /* end point in destination */
1566 mlib_d64 *sp; /* 8-byte aligned start points in src */
1567 mlib_d64 sd0, sd1, sd; /* 8-byte registers for source data */
1568 mlib_s32 off; /* offset of address alignment in src */
1569 mlib_s32 i;
1570
1571 sa = (void *)src;
1572 da = dst + (6 / cmask + 1) / 2; /* 8,4,2,1 -> 0,1,2,3 */
1573
1574 /* prepare the src address */
1575 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1576 off = (mlib_addr) sa & 7;
1577
1578 dend = da + dsize * 4 - 1;
1579
1580 sd1 = *sp++;
1581
1582#pragma pipeloop(0)
1583 for (i = 0; i < dsize / 4; i++) {
1584 LOAD_INSERT_STORE_S16_1X(4);
1585 }
1586
1587 /* right end handling */
1588 if ((mlib_addr) da <= (mlib_addr) dend) {
1589
1590 vis_alignaddr((void *)0, off);
1591 sd0 = sd1;
1592 sd1 = *sp++;
1593 sd = vis_faligndata(sd0, sd1);
1594
1595 vis_alignaddr((void *)0, 2);
1596 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1597 da += 4;
1598 if ((mlib_addr) da <= (mlib_addr) dend) {
1599 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1600 da += 4;
1601 if ((mlib_addr) da <= (mlib_addr) dend) {
1602 vis_st_u16(sd = vis_faligndata(sd, sd), da);
1603 }
1604 }
1605 }
1606}
1607
1608/***************************************************************/
1609void mlib_v_ImageChannelInsert_S16_14(const mlib_s16 *src,
1610 mlib_s32 slb,
1611 mlib_s16 *dst,
1612 mlib_s32 dlb,
1613 mlib_s32 xsize,
1614 mlib_s32 ysize,
1615 mlib_s32 cmask)
1616{
1617 mlib_s16 *sa, *da;
1618 mlib_s16 *sl, *dl;
1619 mlib_s32 j;
1620
1621 sa = sl = (void *)src;
1622 da = dl = dst;
1623
1624#pragma pipeloop(0)
1625 for (j = 0; j < ysize; j++) {
1626 mlib_v_ImageChannelInsert_S16_14_D1(sa, da, xsize, cmask);
1627 sa = sl = (mlib_s16 *) ((mlib_u8 *) sl + slb);
1628 da = dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1629 }
1630}
1631
1632/***************************************************************/