blob: aacdd4fc89902d41f8cc99ca9196b4318141bb18 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 1998-2003 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26
27
28/*
29 * FILENAME: mlib_v_ImageChannelInsert_34.c
30 *
31 * FUNCTIONS
32 * mlib_v_ImageChannelInsert_U8_34R_A8D1X8
33 * mlib_v_ImageChannelInsert_U8_34R_A8D2X8
34 * mlib_v_ImageChannelInsert_U8_34R_D1
35 * mlib_v_ImageChannelInsert_U8_34R
36 * mlib_v_ImageChannelInsert_S16_34R_A8D1X4
37 * mlib_v_ImageChannelInsert_S16_34R_A8D2X4
38 * mlib_v_ImageChannelInsert_S16_34R_D1
39 * mlib_v_ImageChannelInsert_S16_34R
40 * mlib_v_ImageChannelInsert_U8_34L_A8D1X8
41 * mlib_v_ImageChannelInsert_U8_34L_A8D2X8
42 * mlib_v_ImageChannelInsert_U8_34L_D1
43 * mlib_v_ImageChannelInsert_U8_34L
44 * mlib_v_ImageChannelInsert_S16_34L_A8D1X4
45 * mlib_v_ImageChannelInsert_S16_34L_A8D2X4
46 * mlib_v_ImageChannelInsert_S16_34L_D1
47 * mlib_v_ImageChannelInsert_S16_34L
48 *
49 * SYNOPSIS
50 *
51 * ARGUMENT
52 * src pointer to source image data
53 * dst pointer to destination image data
54 * slb source image line stride in bytes
55 * dlb destination image line stride in bytes
56 * dsize image data size in pixels
57 * xsize image width in pixels
58 * ysize image height in lines
59 * cmask channel mask
60 *
61 * DESCRIPTION
62 * Low-level functions that insert a 3-channel image into the right or
63 * left 3 channels of a 4-channel image.
64 *
65 * BGR => ABGR (34R), or RGB => RGBA (34L)
66 *
67 * NOTE
68 * These functions are separated from mlib_v_ImageChannelInsert.c
69 * for loop unrolling and structure clarity.
70 */
71
72#include <stdlib.h>
73#include "vis_proto.h"
74#include "mlib_image.h"
75
76/***************************************************************/
/*
 * INSERT_U8_34R: rearrange 24 packed 3-channel bytes (8 pixels) held in
 * sd0, sd1, sd2 into four 8-byte words dd0..dd3 holding 2 pixels each.
 *
 * The macro expands in the caller's scope: sd0..sd2 are inputs, dd0..dd3
 * are outputs and sda..sdm are scratch mlib_d64 locals that the caller
 * must declare.
 *
 * Using the lane names from the load comments (b, g, r) and the byte
 * interleave performed by vis_fpmerge, the first nine merges separate the
 * channels (sdg, sdh, sdi), the remaining ones re-interleave them.
 * Bytes 0 and 4 of every ddN are only a filler copy of the g lane; the
 * callers in this file mask them off with a 0x77-pattern partial store.
 */
77#define INSERT_U8_34R \
78 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
79 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
80 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
81 sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \
82 sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \
83 sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \
84 sdg = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde)); /* b0b1b2b3b4b5b6b7 */ \
85 sdh = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sdf)); /* g0g1g2g3g4g5g6g7 */ \
86 sdi = vis_fpmerge(vis_read_hi(sde), vis_read_lo(sdf)); /* r0r1r2r3r4r5r6r7 */ \
87 sdj = vis_fpmerge(vis_read_hi(sdg), vis_read_hi(sdi)); /* b0r0b1r1b2r2b3r3 */ \
88 sdk = vis_fpmerge(vis_read_lo(sdg), vis_read_lo(sdi)); /* b4r4b5r5b6r6b7r7 */ \
89 sdl = vis_fpmerge(vis_read_hi(sdh), vis_read_hi(sdh)); /* g0g0g1g1g2g2g3g3 */ \
90 sdm = vis_fpmerge(vis_read_lo(sdh), vis_read_lo(sdh)); /* g4g4g5g5g6g6g7g7 */ \
91 dd0 = vis_fpmerge(vis_read_hi(sdl), vis_read_hi(sdj)); /* g0b0g0r0g1b1g1r1 */ \
92 dd1 = vis_fpmerge(vis_read_lo(sdl), vis_read_lo(sdj)); /* g2b2g2r2g3b3g3r3 */ \
93 dd2 = vis_fpmerge(vis_read_hi(sdm), vis_read_hi(sdk)); /* g4b4g4r4g5b5g5r5 */ \
94 dd3 = vis_fpmerge(vis_read_lo(sdm), vis_read_lo(sdk)); /* g6b6g6r6g7b7g7r7 */
95
96/***************************************************************/
/*
 * LOAD_INSERT_STORE_U8_34R_A8: one step of the 8-byte aligned U8 path.
 *
 * Reads three consecutive mlib_d64 words through sp (8 packed pixels),
 * converts them with INSERT_U8_34R and writes four words through dp with
 * the partial-store mask bmask.  Both sp and dp are advanced by the macro
 * (3 and 4 words respectively).  sp, dp, bmask and all INSERT_U8_34R
 * operands are locals of the expanding function.
 */
97#define LOAD_INSERT_STORE_U8_34R_A8 \
98 sd0 = *sp++; /* b0g0r0b1g1r1b2g2 */ \
99 sd1 = *sp++; /* r2b3g3r3b4g4r4b5 */ \
100 sd2 = *sp++; /* g5r5b6g6r6b7g7r7 */ \
101 INSERT_U8_34R \
102 vis_pst_8(dd0, dp++, bmask); \
103 vis_pst_8(dd1, dp++, bmask); \
104 vis_pst_8(dd2, dp++, bmask); \
105 vis_pst_8(dd3, dp++, bmask);
106
107/***************************************************************/
/*
 * LOAD_INSERT_U8_34R: load-and-convert step for a source that is not
 * 8-byte aligned.
 *
 * sp points at an aligned word and soff is the byte offset of the real
 * source data inside it.  s3 carries the last raw word fetched by the
 * previous expansion, so the caller must seed it with sp[0] before the
 * first use; each expansion then fetches sp[1]..sp[3] and advances sp by
 * three words.  vis_alignaddr(soff, 0) is re-issued every time because
 * callers change the alignment offset between expansions.
 *
 * dd4 receives the dd3 of the previous expansion before INSERT_U8_34R
 * overwrites it; on the very first expansion dd3 has not been assigned
 * yet, so dd4 is only meaningful from the second expansion on.
 */
108#define LOAD_INSERT_U8_34R \
109 vis_alignaddr((void *)soff, 0); \
110 s0 = s3; \
111 s1 = sp[1]; \
112 s2 = sp[2]; \
113 s3 = sp[3]; \
114 sd0 = vis_faligndata(s0, s1); \
115 sd1 = vis_faligndata(s1, s2); \
116 sd2 = vis_faligndata(s2, s3); \
117 sp += 3; \
118 dd4 = dd3; \
119 INSERT_U8_34R
120
121/***************************************************************/
122/*
123 * Both source and destination image data are 1-d vectors and
124 * 8-byte aligned. And dsize is multiple of 8.
125 */
126
127void
128mlib_v_ImageChannelInsert_U8_34R_A8D1X8(mlib_u8 *src,
129 mlib_u8 *dst,
130 mlib_s32 dsize)
131{
132 mlib_d64 *sp, *dp;
133 mlib_d64 sd0, sd1, sd2; /* source data */
134 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
135 mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */
136 mlib_d64 sde, sdf, sdg, sdh;
137 mlib_d64 sdi, sdj, sdk, sdl;
138 mlib_d64 sdm;
139 int bmask = 0x77;
140 int i;
141
142 sp = (mlib_d64 *)src;
143 dp = (mlib_d64 *)dst;
144
145#pragma pipeloop(0)
146 for (i = 0; i < dsize / 8; i++) {
147 LOAD_INSERT_STORE_U8_34R_A8;
148 }
149}
150
151/***************************************************************/
152/*
153 * Either source or destination image data are not 1-d vectors, but
154 * they are 8-byte aligned. And slb and dlb are multiple of 8.
155 * The xsize is multiple of 8.
156 */
157
158void
159mlib_v_ImageChannelInsert_U8_34R_A8D2X8(mlib_u8 *src, mlib_s32 slb,
160 mlib_u8 *dst, mlib_s32 dlb,
161 mlib_s32 xsize, mlib_s32 ysize)
162{
163 mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */
164 mlib_d64 *sl, *dl; /* 8-byte aligned pointer for line */
165 mlib_d64 sd0, sd1, sd2; /* source data */
166 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
167 mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */
168 mlib_d64 sde, sdf, sdg, sdh;
169 mlib_d64 sdi, sdj, sdk, sdl;
170 mlib_d64 sdm;
171 int bmask = 0x77;
172 int i, j; /* indices for x, y */
173
174 sp = sl = (mlib_d64 *)src;
175 dp = dl = (mlib_d64 *)dst;
176
177 /* row loop */
178 for (j = 0; j < ysize; j++) {
179 /* 8-byte column loop */
180#pragma pipeloop(0)
181 for (i = 0; i < xsize / 8; i++) {
182 LOAD_INSERT_STORE_U8_34R_A8;
183 }
184 sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
185 dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
186 }
187}
188
189/***************************************************************/
190/*
191 * either source or destination data are not 8-byte aligned.
192 */
193
194void
195mlib_v_ImageChannelInsert_U8_34R_D1(mlib_u8 *src,
196 mlib_u8 *dst,
197 mlib_s32 dsize)
198{
199 mlib_u8 *sa, *da;
200 mlib_u8 *dend, *dend2; /* end points in dst */
201 mlib_d64 *dp; /* 8-byte aligned start points in dst */
202 mlib_d64 *sp; /* 8-byte aligned start point in src */
203 mlib_d64 s0, s1, s2, s3; /* 8-byte source raw data */
204 mlib_d64 sd0, sd1, sd2; /* 8-byte source data */
205 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
206 mlib_d64 dd4; /* the last datum of the last step */
207 mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */
208 mlib_d64 sde, sdf, sdg, sdh;
209 mlib_d64 sdi, sdj, sdk, sdl;
210 mlib_d64 sdm;
211 int soff; /* offset of address in src */
212 int doff; /* offset of address in dst */
213 int emask; /* edge mask */
214 int bmask; /* channel mask */
215 int i, n;
216
217 sa = src;
218 da = dst;
219
220 /* prepare the source address */
221 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
222 soff = ((mlib_addr) sa & 7);
223
224 /* prepare the destination addresses */
225 dp = (mlib_d64 *)((mlib_addr) da & (~7));
226 dend = da + dsize * 4 - 1;
227 dend2 = dend - 31;
228 doff = ((mlib_addr) da & 7);
229
230 /* set band mask for vis_pst_8 to store the bytes needed */
231 bmask = 0xff & (0x7777 >> doff) ;
232
233 /* generate edge mask for the start point */
234 emask = vis_edge8(da, dend);
235
236 /* load 24 bytes, convert to 32 bytes */
237 s3 = sp[0]; /* initial value */
238 LOAD_INSERT_U8_34R;
239
240 if (doff == 0) { /* dst is 8-byte aligned */
241
242 if (dsize >= 8 ) {
243 vis_pst_8(dd0, dp++, emask & bmask);
244 vis_pst_8(dd1, dp++, bmask);
245 vis_pst_8(dd2, dp++, bmask);
246 vis_pst_8(dd3, dp++, bmask);
247 }
248 else { /* for very small size */
249 vis_pst_8(dd0, dp++, emask & bmask);
250 if ((mlib_addr) dp <= (mlib_addr) dend) {
251 emask = vis_edge8(dp, dend);
252 vis_pst_8(dd1, dp++, emask & bmask);
253 if ((mlib_addr) dp <= (mlib_addr) dend) {
254 emask = vis_edge8(dp, dend);
255 vis_pst_8(dd2, dp++, emask & bmask);
256 if ((mlib_addr) dp <= (mlib_addr) dend) {
257 emask = vis_edge8(dp, dend);
258 vis_pst_8(dd3, dp++, emask & bmask);
259 }
260 }
261 }
262 }
263
264 /* no edge handling is needed in the loop */
265 if ((mlib_addr) dp <= (mlib_addr) dend2) {
266 n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
267#pragma pipeloop(0)
268 for (i = 0; i < n; i++) {
269 LOAD_INSERT_U8_34R;
270 vis_pst_8(dd0, dp++, bmask);
271 vis_pst_8(dd1, dp++, bmask);
272 vis_pst_8(dd2, dp++, bmask);
273 vis_pst_8(dd3, dp++, bmask);
274 }
275 }
276
277 if ((mlib_addr) dp <= (mlib_addr) dend) {
278 LOAD_INSERT_U8_34R;
279 emask = vis_edge8(dp, dend);
280 vis_pst_8(dd0, dp++, emask & bmask);
281 if ((mlib_addr) dp <= (mlib_addr) dend) {
282 emask = vis_edge8(dp, dend);
283 vis_pst_8(dd1, dp++, emask & bmask);
284 if ((mlib_addr) dp <= (mlib_addr) dend) {
285 emask = vis_edge8(dp, dend);
286 vis_pst_8(dd2, dp++, emask & bmask);
287 if ((mlib_addr) dp <= (mlib_addr) dend) {
288 emask = vis_edge8(dp, dend);
289 vis_pst_8(dd3, dp++, emask & bmask);
290 }
291 }
292 }
293 }
294 }
295 else { /* (doff != 0) */
296 vis_alignaddr((void *)0, -doff);
297
298 if (dsize >= 8 ) {
299 vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
300 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
301 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
302 vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
303 }
304 else { /* for very small size */
305 vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
306 if ((mlib_addr) dp <= (mlib_addr) dend) {
307 emask = vis_edge8(dp, dend);
308 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
309 if ((mlib_addr) dp <= (mlib_addr) dend) {
310 emask = vis_edge8(dp, dend);
311 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
312 if ((mlib_addr) dp <= (mlib_addr) dend) {
313 emask = vis_edge8(dp, dend);
314 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
315 if ((mlib_addr) dp <= (mlib_addr) dend) {
316 emask = vis_edge8(dp, dend);
317 vis_pst_8(vis_faligndata(dd3, dd3), dp++, emask & bmask);
318 }
319 }
320 }
321 }
322 }
323
324 /* no edge handling is needed in the loop */
325 if ((mlib_addr) dp <= (mlib_addr) dend2) {
326 n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
327#pragma pipeloop(0)
328 for (i = 0; i < n; i++) {
329 LOAD_INSERT_U8_34R;
330 vis_alignaddr((void *)0, -doff);
331 vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask);
332 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
333 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
334 vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
335 }
336 }
337
338 if ((mlib_addr) dp <= (mlib_addr) dend) {
339 LOAD_INSERT_U8_34R;
340 vis_alignaddr((void *)0, -doff);
341 emask = vis_edge8(dp, dend);
342 vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask);
343 if ((mlib_addr) dp <= (mlib_addr) dend) {
344 emask = vis_edge8(dp, dend);
345 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
346 if ((mlib_addr) dp <= (mlib_addr) dend) {
347 emask = vis_edge8(dp, dend);
348 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
349 if ((mlib_addr) dp <= (mlib_addr) dend) {
350 emask = vis_edge8(dp, dend);
351 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
352 }
353 }
354 }
355 }
356 }
357}
358
359/***************************************************************/
360
361void
362mlib_v_ImageChannelInsert_U8_34R(mlib_u8 *src, mlib_s32 slb,
363 mlib_u8 *dst, mlib_s32 dlb,
364 mlib_s32 xsize, mlib_s32 ysize)
365{
366 mlib_u8 *sa, *da;
367 mlib_u8 *sl, *dl;
368 int j;
369
370 sa = sl = src;
371 da = dl = dst;
372
373#pragma pipeloop(0)
374 for (j = 0; j < ysize; j++) {
375 mlib_v_ImageChannelInsert_U8_34R_D1(sa, da, xsize);
376 sa = sl += slb;
377 da = dl += dlb;
378 }
379}
380
381/***************************************************************/
/*
 * INSERT_S16_34R: turn 12 packed 16-bit samples (4 pixels) held in
 * sd0, sd1, sd2 into four words dd0..dd3, one pixel per word, with the
 * three samples of the pixel in the last three 16-bit lanes.  Lane 0 of
 * every ddN is filler that the callers mask off (0x07-pattern store).
 *
 * The macro expands in the caller's scope and reprograms the
 * vis_faligndata offset three times (6, 4, 2 bytes); the offset is left
 * at 2 afterwards, so callers that need another offset must set it again.
 */
382#define INSERT_S16_34R \
383 vis_alignaddr((void *)0, 6); \
384 dd0 = vis_faligndata(sd0, sd0); /* b1b0g0r0 */ \
385 vis_alignaddr((void *)0, 4); \
386 dd1 = vis_faligndata(sd0, sd1); /* r0b1g1r1 */ \
387 vis_alignaddr((void *)0, 2); \
388 dd2 = vis_faligndata(sd1, sd2); /* r1b2g2r2 */ \
389 dd3 = sd2; /* r2b3g3r3 */
390
391/***************************************************************/
/*
 * LOAD_INSERT_STORE_S16_34R_A8: one step of the 8-byte aligned S16 path.
 *
 * Reads three consecutive mlib_d64 words through sp (4 packed pixels),
 * converts them with INSERT_S16_34R and writes four words through dp
 * with the partial-store mask bmask.  sp and dp are advanced by 3 and 4
 * words; all names are locals of the expanding function.
 */
392#define LOAD_INSERT_STORE_S16_34R_A8 \
393 sd0 = *sp++; /* b0g0r0b1 */ \
394 sd1 = *sp++; /* g1r1b2g2 */ \
395 sd2 = *sp++; /* r2b3g3r3 */ \
396 INSERT_S16_34R \
397 vis_pst_16(dd0, dp++, bmask); \
398 vis_pst_16(dd1, dp++, bmask); \
399 vis_pst_16(dd2, dp++, bmask); \
400 vis_pst_16(dd3, dp++, bmask);
401
402/***************************************************************/
/*
 * LOAD_INSERT_S16_34R: load-and-convert step for a source that is not
 * 8-byte aligned.
 *
 * sp points at an aligned word and soff is the byte offset of the real
 * source data inside it.  s3 carries the last raw word fetched by the
 * previous expansion (the caller seeds it with sp[0]); each expansion
 * fetches sp[1]..sp[3] and advances sp by three words.  The alignment
 * offset is set to soff first because INSERT_S16_34R and the callers
 * change it between expansions.
 *
 * dd4 receives the dd3 of the previous expansion before it is
 * overwritten; on the very first expansion dd3 has not been assigned
 * yet, so dd4 is only meaningful from the second expansion on.
 */
403#define LOAD_INSERT_S16_34R \
404 vis_alignaddr((void *)soff, 0); \
405 s0 = s3; \
406 s1 = sp[1]; \
407 s2 = sp[2]; \
408 s3 = sp[3]; \
409 sd0 = vis_faligndata(s0, s1); \
410 sd1 = vis_faligndata(s1, s2); \
411 sd2 = vis_faligndata(s2, s3); \
412 sp += 3; \
413 dd4 = dd3; \
414 INSERT_S16_34R
415
416/***************************************************************/
417/*
418 * both source and destination image data are 1-d vectors and
419 * 8-byte aligned. dsize is multiple of 4.
420 */
421
422void
423mlib_v_ImageChannelInsert_S16_34R_A8D1X4(mlib_s16 *src,
424 mlib_s16 *dst,
425 mlib_s32 dsize)
426{
427 mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */
428 mlib_d64 sd0, sd1, sd2; /* source data */
429 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
430 int bmask = 0x07; /* channel mask */
431 int i;
432
433 sp = (mlib_d64 *)src;
434 dp = (mlib_d64 *)dst;
435
436 /* set GSR.offset for vis_faligndata() */
437 /* vis_alignaddr((void *)0, 2); */ /* only for _old */
438
439#pragma pipeloop(0)
440 for (i = 0; i < dsize / 4; i++) {
441 LOAD_INSERT_STORE_S16_34R_A8;
442 }
443}
444
445/***************************************************************/
446/*
447 * either source or destination image data are not 1-d vectors, but
448 * they are 8-byte aligned. xsize is multiple of 4.
449 */
450
451void
452mlib_v_ImageChannelInsert_S16_34R_A8D2X4(mlib_s16 *src, mlib_s32 slb,
453 mlib_s16 *dst, mlib_s32 dlb,
454 mlib_s32 xsize, mlib_s32 ysize)
455{
456 mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */
457 mlib_d64 *sl, *dl; /* 8-byte aligned pointer for line */
458 mlib_d64 sd0, sd1, sd2; /* source data */
459 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
460 int bmask = 0x07; /* channel mask */
461 int i, j; /* indices for x, y */
462
463 sp = sl = (mlib_d64 *)src;
464 dp = dl = (mlib_d64 *)dst;
465
466 /* row loop */
467 for (j = 0; j < ysize; j++) {
468 /* 4-pixel column loop */
469#pragma pipeloop(0)
470 for (i = 0; i < xsize / 4; i++) {
471 LOAD_INSERT_STORE_S16_34R_A8;
472 }
473 sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
474 dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
475 }
476}
477
478/***************************************************************/
479/*
480 * either source or destination data are not 8-byte aligned.
481 */
482
483void
484mlib_v_ImageChannelInsert_S16_34R_D1(mlib_s16 *src,
485 mlib_s16 *dst,
486 mlib_s32 dsize)
487{
488 mlib_s16 *sa, *da; /* pointer for pixel */
489 mlib_s16 *dend, *dend2; /* end points in dst */
490 mlib_d64 *dp; /* 8-byte aligned start points in dst */
491 mlib_d64 *sp; /* 8-byte aligned start point in src */
492 mlib_d64 s0, s1, s2, s3; /* 8-byte source raw data */
493 mlib_d64 sd0, sd1, sd2; /* 8-byte source data */
494 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
495 mlib_d64 dd4; /* the last datum of the last step */
496 int soff; /* offset of address in src */
497 int doff; /* offset of address in dst */
498 int emask; /* edge mask */
499 int bmask; /* channel mask */
500 int i, n;
501
502 sa = src;
503 da = dst;
504
505 /* prepare the source address */
506 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
507 soff = ((mlib_addr) sa & 7);
508
509 /* prepare the destination addresses */
510 dp = (mlib_d64 *)((mlib_addr) da & (~7));
511 dend = da + dsize * 4 - 1;
512 dend2 = dend - 15;
513 doff = ((mlib_addr) da & 7);
514
515 /* set channel mask for vis_pst_16 to store the words needed */
516 bmask = 0xff & (0x77 >> (doff / 2));
517
518 /* generate edge mask for the start point */
519 emask = vis_edge16(da, dend);
520
521 /* load 24 byte, convert, store 32 bytes */
522 s3 = sp[0]; /* initial value */
523 LOAD_INSERT_S16_34R;
524
525 if (doff == 0) { /* dst is 8-byte aligned */
526
527 if (dsize >= 4 ) {
528 vis_pst_16(dd0, dp++, emask & bmask);
529 vis_pst_16(dd1, dp++, bmask);
530 vis_pst_16(dd2, dp++, bmask);
531 vis_pst_16(dd3, dp++, bmask);
532 }
533 else { /* for very small size */
534 vis_pst_16(dd0, dp++, emask & bmask);
535 if ((mlib_addr) dp <= (mlib_addr) dend) {
536 emask = vis_edge16(dp, dend);
537 vis_pst_16(dd1, dp++, emask & bmask);
538 if ((mlib_addr) dp <= (mlib_addr) dend) {
539 emask = vis_edge16(dp, dend);
540 vis_pst_16(dd2, dp++, emask & bmask);
541 }
542 }
543 }
544
545 /* no edge handling is needed in the loop */
546 if ((mlib_addr) dp <= (mlib_addr) dend2) {
547 n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
548#pragma pipeloop(0)
549 for (i = 0; i < n; i++) {
550 LOAD_INSERT_S16_34R;
551 vis_pst_16(dd0, dp++, bmask);
552 vis_pst_16(dd1, dp++, bmask);
553 vis_pst_16(dd2, dp++, bmask);
554 vis_pst_16(dd3, dp++, bmask);
555 }
556 }
557
558 if ((mlib_addr) dp <= (mlib_addr) dend) {
559 LOAD_INSERT_S16_34R;
560 emask = vis_edge16(dp, dend);
561 vis_pst_16(dd0, dp++, emask & bmask);
562 if ((mlib_addr) dp <= (mlib_addr) dend) {
563 emask = vis_edge16(dp, dend);
564 vis_pst_16(dd1, dp++, emask & bmask);
565 if ((mlib_addr) dp <= (mlib_addr) dend) {
566 emask = vis_edge16(dp, dend);
567 vis_pst_16(dd2, dp++, emask & bmask);
568 }
569 }
570 }
571 }
572 else { /* (doff != 0) */
573 vis_alignaddr((void *)0, -doff);
574
575 if (dsize >= 4 ) {
576 vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask);
577 vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask);
578 vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask);
579 vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask);
580 }
581 else { /* for very small size */
582 vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask);
583 if ((mlib_addr) dp <= (mlib_addr) dend) {
584 emask = vis_edge16(dp, dend);
585 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask);
586 if ((mlib_addr) dp <= (mlib_addr) dend) {
587 emask = vis_edge16(dp, dend);
588 vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask);
589 if ((mlib_addr) dp <= (mlib_addr) dend) {
590 emask = vis_edge16(dp, dend);
591 vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask);
592 }
593 }
594 }
595 }
596
597 /* no edge handling is needed in the loop */
598 if ((mlib_addr) dp <= (mlib_addr) dend2) {
599 n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
600#pragma pipeloop(0)
601 for (i = 0; i < n; i++) {
602 LOAD_INSERT_S16_34R;
603 vis_alignaddr((void *)0, -doff);
604 vis_pst_16(vis_faligndata(dd4, dd0), dp++, bmask);
605 vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask);
606 vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask);
607 vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask);
608 }
609 }
610
611 if ((mlib_addr) dp <= (mlib_addr) dend) {
612 LOAD_INSERT_S16_34R;
613 vis_alignaddr((void *)0, -doff);
614 emask = vis_edge16(dp, dend);
615 vis_pst_16(vis_faligndata(dd4, dd0), dp++, emask & bmask);
616 if ((mlib_addr) dp <= (mlib_addr) dend) {
617 emask = vis_edge16(dp, dend);
618 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask);
619 if ((mlib_addr) dp <= (mlib_addr) dend) {
620 emask = vis_edge16(dp, dend);
621 vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask);
622 if ((mlib_addr) dp <= (mlib_addr) dend) {
623 emask = vis_edge16(dp, dend);
624 vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask);
625 }
626 }
627 }
628 }
629 }
630}
631
632/***************************************************************/
633
634void
635mlib_v_ImageChannelInsert_S16_34R(mlib_s16 *src, mlib_s32 slb,
636 mlib_s16 *dst, mlib_s32 dlb,
637 mlib_s32 xsize, mlib_s32 ysize)
638{
639 mlib_s16 *sa, *da;
640 mlib_s16 *sl, *dl;
641 int j;
642
643 sa = sl = src;
644 da = dl = dst;
645
646#pragma pipeloop(0)
647 for (j = 0; j < ysize; j++) {
648 mlib_v_ImageChannelInsert_S16_34R_D1(sa, da, xsize);
649 sa = sl = (mlib_s16 *)((mlib_u8 *)sl + slb);
650 da = dl = (mlib_s16 *)((mlib_u8 *)dl + dlb);
651 }
652}
653
654/***************************************************************/
/*
 * INSERT_U8_34L: rearrange 24 packed 3-channel bytes (8 pixels) held in
 * sd0, sd1, sd2 into four 8-byte words dd0..dd3 holding 2 pixels each,
 * with the three source bytes in the first three bytes of every pixel.
 *
 * Identical to INSERT_U8_34R up to sdm; only the operand order of the
 * last four merges differs, which moves the filler byte (a copy of the
 * middle lane) to bytes 3 and 7 of every ddN.  The callers in this file
 * mask those bytes off with a 0xee-pattern partial store.
 *
 * The macro expands in the caller's scope: sd0..sd2 are inputs, dd0..dd3
 * outputs and sda..sdm scratch mlib_d64 locals.  Lane names below follow
 * the b/g/r labels used by the load comments.
 */
655#define INSERT_U8_34L \
656 sda = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); \
657 sdb = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); \
658 sdc = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); \
659 sdd = vis_fpmerge(vis_read_hi(sda), vis_read_lo(sdb)); \
660 sde = vis_fpmerge(vis_read_lo(sda), vis_read_hi(sdc)); \
661 sdf = vis_fpmerge(vis_read_hi(sdb), vis_read_lo(sdc)); \
662 sdg = vis_fpmerge(vis_read_hi(sdd), vis_read_lo(sde)); /* b0b1b2b3b4b5b6b7 */ \
663 sdh = vis_fpmerge(vis_read_lo(sdd), vis_read_hi(sdf)); /* g0g1g2g3g4g5g6g7 */ \
664 sdi = vis_fpmerge(vis_read_hi(sde), vis_read_lo(sdf)); /* r0r1r2r3r4r5r6r7 */ \
665 sdj = vis_fpmerge(vis_read_hi(sdg), vis_read_hi(sdi)); /* b0r0b1r1b2r2b3r3 */ \
666 sdk = vis_fpmerge(vis_read_lo(sdg), vis_read_lo(sdi)); /* b4r4b5r5b6r6b7r7 */ \
667 sdl = vis_fpmerge(vis_read_hi(sdh), vis_read_hi(sdh)); /* g0g0g1g1g2g2g3g3 */ \
668 sdm = vis_fpmerge(vis_read_lo(sdh), vis_read_lo(sdh)); /* g4g4g5g5g6g6g7g7 */ \
669 dd0 = vis_fpmerge(vis_read_hi(sdj), vis_read_hi(sdl)); /* b0g0r0g0b1g1r1g1 */ \
670 dd1 = vis_fpmerge(vis_read_lo(sdj), vis_read_lo(sdl)); /* b2g2r2g2b3g3r3g3 */ \
671 dd2 = vis_fpmerge(vis_read_hi(sdk), vis_read_hi(sdm)); /* b4g4r4g4b5g5r5g5 */ \
672 dd3 = vis_fpmerge(vis_read_lo(sdk), vis_read_lo(sdm)); /* b6g6r6g6b7g7r7g7 */
673
674/***************************************************************/
/*
 * LOAD_INSERT_STORE_U8_34L_A8: one step of the 8-byte aligned U8 path.
 *
 * Reads three consecutive mlib_d64 words through sp (8 packed pixels),
 * converts them with INSERT_U8_34L and writes four words through dp with
 * the partial-store mask bmask.  Both sp and dp are advanced by the macro
 * (3 and 4 words respectively).  sp, dp, bmask and all INSERT_U8_34L
 * operands are locals of the expanding function.
 */
675#define LOAD_INSERT_STORE_U8_34L_A8 \
676 sd0 = *sp++; /* b0g0r0b1g1r1b2g2 */ \
677 sd1 = *sp++; /* r2b3g3r3b4g4r4b5 */ \
678 sd2 = *sp++; /* g5r5b6g6r6b7g7r7 */ \
679 INSERT_U8_34L \
680 vis_pst_8(dd0, dp++, bmask); \
681 vis_pst_8(dd1, dp++, bmask); \
682 vis_pst_8(dd2, dp++, bmask); \
683 vis_pst_8(dd3, dp++, bmask);
684
685/***************************************************************/
/*
 * LOAD_INSERT_U8_34L: load-and-convert step for a source that is not
 * 8-byte aligned.
 *
 * sp points at an aligned word and soff is the byte offset of the real
 * source data inside it.  s3 carries the last raw word fetched by the
 * previous expansion, so the caller must seed it with sp[0] before the
 * first use; each expansion then fetches sp[1]..sp[3] and advances sp by
 * three words.  vis_alignaddr(soff, 0) is re-issued every time because
 * callers change the alignment offset between expansions.
 *
 * dd4 receives the dd3 of the previous expansion before INSERT_U8_34L
 * overwrites it; on the very first expansion dd3 has not been assigned
 * yet, so dd4 is only meaningful from the second expansion on.
 */
686#define LOAD_INSERT_U8_34L \
687 vis_alignaddr((void *)soff, 0); \
688 s0 = s3; \
689 s1 = sp[1]; \
690 s2 = sp[2]; \
691 s3 = sp[3]; \
692 sd0 = vis_faligndata(s0, s1); \
693 sd1 = vis_faligndata(s1, s2); \
694 sd2 = vis_faligndata(s2, s3); \
695 sp += 3; \
696 dd4 = dd3; \
697 INSERT_U8_34L
698
699/***************************************************************/
700/*
701 * Both source and destination image data are 1-d vectors and
702 * 8-byte aligned. And dsize is multiple of 8.
703 */
704void
705mlib_v_ImageChannelInsert_U8_34L_A8D1X8(mlib_u8 *src,
706 mlib_u8 *dst,
707 mlib_s32 dsize)
708{
709 mlib_d64 *sp, *dp;
710 mlib_d64 sd0, sd1, sd2; /* source data */
711 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
712 mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */
713 mlib_d64 sde, sdf, sdg, sdh;
714 mlib_d64 sdi, sdj, sdk, sdl;
715 mlib_d64 sdm;
716 int bmask = 0xee;
717 int i;
718
719 sp = (mlib_d64 *)src;
720 dp = (mlib_d64 *)dst;
721
722#pragma pipeloop(0)
723 for (i = 0; i < dsize / 8; i++) {
724 LOAD_INSERT_STORE_U8_34L_A8;
725 }
726}
727
728/***************************************************************/
729/*
730 * Either source or destination image data are not 1-d vectors, but
731 * they are 8-byte aligned. And slb and dlb are multiple of 8.
732 * The xsize is multiple of 8.
733 */
734void
735mlib_v_ImageChannelInsert_U8_34L_A8D2X8(mlib_u8 *src, mlib_s32 slb,
736 mlib_u8 *dst, mlib_s32 dlb,
737 mlib_s32 xsize, mlib_s32 ysize)
738{
739 mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */
740 mlib_d64 *sl, *dl; /* 8-byte aligned pointer for line */
741 mlib_d64 sd0, sd1, sd2; /* source data */
742 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
743 mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */
744 mlib_d64 sde, sdf, sdg, sdh;
745 mlib_d64 sdi, sdj, sdk, sdl;
746 mlib_d64 sdm;
747 int bmask = 0xee;
748 int i, j; /* indices for x, y */
749
750 sp = sl = (mlib_d64 *)src;
751 dp = dl = (mlib_d64 *)dst;
752
753 /* row loop */
754 for (j = 0; j < ysize; j++) {
755 /* 8-byte column loop */
756#pragma pipeloop(0)
757 for (i = 0; i < xsize / 8; i++) {
758 LOAD_INSERT_STORE_U8_34L_A8;
759 }
760 sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
761 dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
762 }
763}
764
765/***************************************************************/
766/*
767 * either source or destination data are not 8-byte aligned.
768 */
769void
770mlib_v_ImageChannelInsert_U8_34L_D1(mlib_u8 *src,
771 mlib_u8 *dst,
772 mlib_s32 dsize)
773{
774 mlib_u8 *sa, *da;
775 mlib_u8 *dend, *dend2; /* end points in dst */
776 mlib_d64 *dp; /* 8-byte aligned start points in dst */
777 mlib_d64 *sp; /* 8-byte aligned start point in src */
778 mlib_d64 s0, s1, s2, s3; /* 8-byte source raw data */
779 mlib_d64 sd0, sd1, sd2; /* 8-byte source data */
780 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
781 mlib_d64 dd4; /* the last datum of the last step */
782 mlib_d64 sda, sdb, sdc, sdd; /* intermediate variables */
783 mlib_d64 sde, sdf, sdg, sdh;
784 mlib_d64 sdi, sdj, sdk, sdl;
785 mlib_d64 sdm;
786 int soff; /* offset of address in src */
787 int doff; /* offset of address in dst */
788 int emask; /* edge mask */
789 int bmask; /* channel mask */
790 int i, n;
791
792 sa = src;
793 da = dst;
794
795 /* prepare the source address */
796 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
797 soff = ((mlib_addr) sa & 7);
798
799 /* prepare the destination addresses */
800 dp = (mlib_d64 *)((mlib_addr) da & (~7));
801 dend = da + dsize * 4 - 1;
802 dend2 = dend - 31;
803 doff = ((mlib_addr) da & 7);
804
805 /* set band mask for vis_pst_8 to store the bytes needed */
806 bmask = 0xff & (0xeeee >> doff) ;
807
808 /* generate edge mask for the start point */
809 emask = vis_edge8(da, dend);
810
811 /* load 24 bytes, convert to 32 bytes */
812 s3 = sp[0]; /* initial value */
813 LOAD_INSERT_U8_34L;
814
815 if (doff == 0) { /* dst is 8-byte aligned */
816
817 if (dsize >= 8 ) {
818 vis_pst_8(dd0, dp++, emask & bmask);
819 vis_pst_8(dd1, dp++, bmask);
820 vis_pst_8(dd2, dp++, bmask);
821 vis_pst_8(dd3, dp++, bmask);
822 }
823 else { /* for very small size */
824 vis_pst_8(dd0, dp++, emask & bmask);
825 if ((mlib_addr) dp <= (mlib_addr) dend) {
826 emask = vis_edge8(dp, dend);
827 vis_pst_8(dd1, dp++, emask & bmask);
828 if ((mlib_addr) dp <= (mlib_addr) dend) {
829 emask = vis_edge8(dp, dend);
830 vis_pst_8(dd2, dp++, emask & bmask);
831 if ((mlib_addr) dp <= (mlib_addr) dend) {
832 emask = vis_edge8(dp, dend);
833 vis_pst_8(dd3, dp++, emask & bmask);
834 }
835 }
836 }
837 }
838
839 /* no edge handling is needed in the loop */
840 if ((mlib_addr) dp <= (mlib_addr) dend2) {
841 n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
842#pragma pipeloop(0)
843 for (i = 0; i < n; i++) {
844 LOAD_INSERT_U8_34L;
845 vis_pst_8(dd0, dp++, bmask);
846 vis_pst_8(dd1, dp++, bmask);
847 vis_pst_8(dd2, dp++, bmask);
848 vis_pst_8(dd3, dp++, bmask);
849 }
850 }
851
852 if ((mlib_addr) dp <= (mlib_addr) dend) {
853 LOAD_INSERT_U8_34L;
854 emask = vis_edge8(dp, dend);
855 vis_pst_8(dd0, dp++, emask & bmask);
856 if ((mlib_addr) dp <= (mlib_addr) dend) {
857 emask = vis_edge8(dp, dend);
858 vis_pst_8(dd1, dp++, emask & bmask);
859 if ((mlib_addr) dp <= (mlib_addr) dend) {
860 emask = vis_edge8(dp, dend);
861 vis_pst_8(dd2, dp++, emask & bmask);
862 if ((mlib_addr) dp <= (mlib_addr) dend) {
863 emask = vis_edge8(dp, dend);
864 vis_pst_8(dd3, dp++, emask & bmask);
865 }
866 }
867 }
868 }
869 }
870 else { /* (doff != 0) */
871 vis_alignaddr((void *)0, -doff);
872
873 if (dsize >= 8 ) {
874 vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
875 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
876 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
877 vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
878 }
879 else { /* for very small size */
880 vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
881 if ((mlib_addr) dp <= (mlib_addr) dend) {
882 emask = vis_edge8(dp, dend);
883 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
884 if ((mlib_addr) dp <= (mlib_addr) dend) {
885 emask = vis_edge8(dp, dend);
886 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
887 if ((mlib_addr) dp <= (mlib_addr) dend) {
888 emask = vis_edge8(dp, dend);
889 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
890 if ((mlib_addr) dp <= (mlib_addr) dend) {
891 emask = vis_edge8(dp, dend);
892 vis_pst_8(vis_faligndata(dd3, dd3), dp++, emask & bmask);
893 }
894 }
895 }
896 }
897 }
898
899 /* no edge handling is needed in the loop */
900 if ((mlib_addr) dp <= (mlib_addr) dend2) {
901 n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
902#pragma pipeloop(0)
903 for (i = 0; i < n; i++) {
904 LOAD_INSERT_U8_34L;
905 vis_alignaddr((void *)0, -doff);
906 vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask);
907 vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
908 vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
909 vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
910 }
911 }
912
913 if ((mlib_addr) dp <= (mlib_addr) dend) {
914 LOAD_INSERT_U8_34L;
915 vis_alignaddr((void *)0, -doff);
916 emask = vis_edge8(dp, dend);
917 vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask);
918 if ((mlib_addr) dp <= (mlib_addr) dend) {
919 emask = vis_edge8(dp, dend);
920 vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
921 if ((mlib_addr) dp <= (mlib_addr) dend) {
922 emask = vis_edge8(dp, dend);
923 vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
924 if ((mlib_addr) dp <= (mlib_addr) dend) {
925 emask = vis_edge8(dp, dend);
926 vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
927 }
928 }
929 }
930 }
931 }
932}
933
934/***************************************************************/
935void
936mlib_v_ImageChannelInsert_U8_34L(mlib_u8 *src, mlib_s32 slb,
937 mlib_u8 *dst, mlib_s32 dlb,
938 mlib_s32 xsize, mlib_s32 ysize)
939{
940 mlib_u8 *sa, *da;
941 mlib_u8 *sl, *dl;
942 int j;
943
944 sa = sl = src;
945 da = dl = dst;
946
947#pragma pipeloop(0)
948 for (j = 0; j < ysize; j++) {
949 mlib_v_ImageChannelInsert_U8_34L_D1(sa, da, xsize);
950 sa = sl += slb;
951 da = dl += dlb;
952 }
953}
954
955/***************************************************************/
/*
 * INSERT_S16_34L: turn 12 packed 16-bit samples (4 pixels) held in
 * sd0, sd1, sd2 into four words dd0..dd3, one pixel per word, with the
 * three samples of the pixel in the first three 16-bit lanes.  Lane 3 of
 * every ddN is filler that the callers mask off (0x0e-pattern store).
 *
 * The macro expands in the caller's scope and reprograms the
 * vis_faligndata offset three times (6, 4, 2 bytes); the offset is left
 * at 2 afterwards, so callers that need another offset must set it again.
 */
956#define INSERT_S16_34L \
957 dd0 = sd0; /* b0g0r0b1 */ \
958 vis_alignaddr((void *)0, 6); \
959 dd1 = vis_faligndata(sd0, sd1); /* b1g1r1b2 */ \
960 vis_alignaddr((void *)0, 4); \
961 dd2 = vis_faligndata(sd1, sd2); /* b2g2r2b3 */ \
962 vis_alignaddr((void *)0, 2); \
963 dd3 = vis_faligndata(sd2, sd2); /* b3g3r3r2 */
964
965/***************************************************************/
/*
 * LOAD_INSERT_STORE_S16_34L_A8: one step of the 8-byte aligned S16 path.
 *
 * Reads three consecutive mlib_d64 words through sp (4 packed pixels),
 * converts them with INSERT_S16_34L and writes four words through dp
 * with the partial-store mask bmask.  sp and dp are advanced by 3 and 4
 * words; all names are locals of the expanding function.
 */
966#define LOAD_INSERT_STORE_S16_34L_A8 \
967 sd0 = *sp++; /* b0g0r0b1 */ \
968 sd1 = *sp++; /* g1r1b2g2 */ \
969 sd2 = *sp++; /* r2b3g3r3 */ \
970 INSERT_S16_34L \
971 vis_pst_16(dd0, dp++, bmask); \
972 vis_pst_16(dd1, dp++, bmask); \
973 vis_pst_16(dd2, dp++, bmask); \
974 vis_pst_16(dd3, dp++, bmask);
975
976/***************************************************************/
/*
 * LOAD_INSERT_S16_34L: load-and-convert step for a source that is not
 * 8-byte aligned.
 *
 * sp points at an aligned word and soff is the byte offset of the real
 * source data inside it.  s3 carries the last raw word fetched by the
 * previous expansion (the caller seeds it with sp[0]); each expansion
 * fetches sp[1]..sp[3] and advances sp by three words.  The alignment
 * offset is set to soff first because INSERT_S16_34L leaves it at 2.
 *
 * dd4 receives the dd3 of the previous expansion before it is
 * overwritten; on the very first expansion dd3 has not been assigned
 * yet, so dd4 is only meaningful from the second expansion on.
 */
977#define LOAD_INSERT_S16_34L \
978 vis_alignaddr((void *)soff, 0); \
979 s0 = s3; \
980 s1 = sp[1]; \
981 s2 = sp[2]; \
982 s3 = sp[3]; \
983 sd0 = vis_faligndata(s0, s1); \
984 sd1 = vis_faligndata(s1, s2); \
985 sd2 = vis_faligndata(s2, s3); \
986 sp += 3; \
987 dd4 = dd3; \
988 INSERT_S16_34L
989
990/***************************************************************/
991/*
992 * both source and destination image data are 1-d vectors and
993 * 8-byte aligned. dsize is multiple of 4.
994 */
995
996void
997mlib_v_ImageChannelInsert_S16_34L_A8D1X4(mlib_s16 *src,
998 mlib_s16 *dst,
999 mlib_s32 dsize)
1000{
1001 mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */
1002 mlib_d64 sd0, sd1, sd2; /* source data */
1003 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
1004 int bmask = 0x0e; /* channel mask */
1005 int i;
1006
1007 sp = (mlib_d64 *)src;
1008 dp = (mlib_d64 *)dst;
1009
1010#pragma pipeloop(0)
1011 for (i = 0; i < dsize / 4; i++) {
1012 LOAD_INSERT_STORE_S16_34L_A8;
1013 }
1014}
1015
1016/***************************************************************/
1017/*
1018 * either source or destination image data are not 1-d vectors, but
1019 * they are 8-byte aligned. xsize is multiple of 4.
1020 */
1021
1022void
1023mlib_v_ImageChannelInsert_S16_34L_A8D2X4(mlib_s16 *src, mlib_s32 slb,
1024 mlib_s16 *dst, mlib_s32 dlb,
1025 mlib_s32 xsize, mlib_s32 ysize)
1026{
1027 mlib_d64 *sp, *dp; /* 8-byte aligned pointer for pixel */
1028 mlib_d64 *sl, *dl; /* 8-byte aligned pointer for line */
1029 mlib_d64 sd0, sd1, sd2; /* source data */
1030 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
1031 int bmask = 0x0e; /* channel mask */
1032 int i, j; /* indices for x, y */
1033
1034 sp = sl = (mlib_d64 *)src;
1035 dp = dl = (mlib_d64 *)dst;
1036
1037 /* row loop */
1038 for (j = 0; j < ysize; j++) {
1039 /* 4-pixel column loop */
1040#pragma pipeloop(0)
1041 for (i = 0; i < xsize / 4; i++) {
1042 LOAD_INSERT_STORE_S16_34L_A8;
1043 }
1044 sp = sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
1045 dp = dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
1046 }
1047}
1048
1049/***************************************************************/
1050/*
 * General S16 path: either source or destination data are not 8-byte
 * aligned.
 *
 * src   - source pointer; it is read through 8-byte words starting at
 *         src rounded down to an 8-byte boundary
 * dst   - destination pointer; the written range ends at element
 *         dst + dsize * 4 - 1 (dend)
 * dsize - size used to compute the destination end; presumably the
 *         number of pixels in the vector -- confirm against callers
 *
 * Each LOAD_INSERT_S16_34L expansion consumes three 8-byte source words
 * and produces four destination words dd0..dd3. The words are written
 * with vis_pst_16 under bmask, combined with an edge mask for the first
 * word and for the trailing words of the destination range.
 *
 * NOTE(review): the first LOAD_INSERT_S16_34L expansion copies dd3 into
 * dd4 before dd3 has been assigned. dd4 is only passed to a store after
 * a later expansion has refreshed it.
 */
1053
1054void
1055mlib_v_ImageChannelInsert_S16_34L_D1(mlib_s16 *src,
1056 mlib_s16 *dst,
1057 mlib_s32 dsize)
1058{
1059 mlib_s16 *sa, *da; /* pointer for pixel */
1060 mlib_s16 *dend, *dend2; /* end points in dst */
1061 mlib_d64 *dp; /* 8-byte aligned start points in dst */
1062 mlib_d64 *sp; /* 8-byte aligned start point in src */
1063 mlib_d64 s0, s1, s2, s3; /* 8-byte source raw data */
1064 mlib_d64 sd0, sd1, sd2; /* 8-byte source data */
1065 mlib_d64 dd0, dd1, dd2, dd3; /* dst data */
1066 mlib_d64 dd4; /* the last datum of the last step */
1067 int soff; /* offset of address in src */
1068 int doff; /* offset of address in dst */
1069 int emask; /* edge mask */
1070 int bmask; /* channel mask */
1071 int i, n;
1072
1073 sa = src;
1074 da = dst;
1075
1076 /* prepare the source address: round down to 8 bytes, keep the byte offset */
1077 sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
1078 soff = ((mlib_addr) sa & 7);
1079
1080 /* prepare the destination addresses */
1081 dp = (mlib_d64 *)((mlib_addr) da & (~7));
1082 dend = da + dsize * 4 - 1; /* last mlib_s16 element of the dst range */
1083 dend2 = dend - 15; /* 15 elements before dend: bound for the unmasked loop */
1084 doff = ((mlib_addr) da & 7);
1085
1086 /* set channel mask for vis_pst_16 to store the words needed */
 /* 0xee is the bit pattern 1110 repeated; it is shifted right by one bit */
 /* for every 2 bytes of destination misalignment and cut to 8 bits */
1087 bmask = 0xff & (0xee >> (doff / 2));
1088
1089 /* generate edge mask for the start point */
1090 emask = vis_edge16(da, dend);
1091
1092 /* load 24 byte, convert, store 32 bytes */
1093 s3 = sp[0]; /* initial value */
1094 LOAD_INSERT_S16_34L;
1095
1096 if (doff == 0) { /* dst is 8-byte aligned */
1097
1098 if (dsize >= 4 ) {
 /* a whole group of four words fits; only the first store is edge-masked */
1099 vis_pst_16(dd0, dp++, emask & bmask);
1100 vis_pst_16(dd1, dp++, bmask);
1101 vis_pst_16(dd2, dp++, bmask);
1102 vis_pst_16(dd3, dp++, bmask);
1103 }
1104 else { /* for very small size */
 /* store word by word, re-checking the end and the edge mask each time */
1105 vis_pst_16(dd0, dp++, emask & bmask);
1106 if ((mlib_addr) dp <= (mlib_addr) dend) {
1107 emask = vis_edge16(dp, dend);
1108 vis_pst_16(dd1, dp++, emask & bmask);
1109 if ((mlib_addr) dp <= (mlib_addr) dend) {
1110 emask = vis_edge16(dp, dend);
1111 vis_pst_16(dd2, dp++, emask & bmask);
1112 }
1113 }
1114 }
1115
1116 /* no edge handling is needed in the loop */
1117 if ((mlib_addr) dp <= (mlib_addr) dend2) {
 /* one iteration per 32 bytes between dp and dend2, inclusive */
1118 n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
1119#pragma pipeloop(0)
1120 for (i = 0; i < n; i++) {
1121 LOAD_INSERT_S16_34L;
1122 vis_pst_16(dd0, dp++, bmask);
1123 vis_pst_16(dd1, dp++, bmask);
1124 vis_pst_16(dd2, dp++, bmask);
1125 vis_pst_16(dd3, dp++, bmask);
1126 }
1127 }
1128
 /* trailing words: up to three edge-masked stores */
1129 if ((mlib_addr) dp <= (mlib_addr) dend) {
1130 LOAD_INSERT_S16_34L;
1131 emask = vis_edge16(dp, dend);
1132 vis_pst_16(dd0, dp++, emask & bmask);
1133 if ((mlib_addr) dp <= (mlib_addr) dend) {
1134 emask = vis_edge16(dp, dend);
1135 vis_pst_16(dd1, dp++, emask & bmask);
1136 if ((mlib_addr) dp <= (mlib_addr) dend) {
1137 emask = vis_edge16(dp, dend);
1138 vis_pst_16(dd2, dp++, emask & bmask);
1139 }
1140 }
1141 }
1142 }
1143 else { /* (doff != 0) */
 /* set the alignment offset from -doff for the vis_faligndata calls below */
1144 vis_alignaddr((void *)0, -doff);
1145
1146 if (dsize >= 4 ) {
 /* each stored word is assembled from two neighbouring dd words */
1147 vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask);
1148 vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask);
1149 vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask);
1150 vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask);
1151 }
1152 else { /* for very small size */
1153 vis_pst_16(vis_faligndata(dd0, dd0), dp++, emask & bmask);
1154 if ((mlib_addr) dp <= (mlib_addr) dend) {
1155 emask = vis_edge16(dp, dend);
1156 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask);
1157 if ((mlib_addr) dp <= (mlib_addr) dend) {
1158 emask = vis_edge16(dp, dend);
1159 vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask);
1160 if ((mlib_addr) dp <= (mlib_addr) dend) {
1161 emask = vis_edge16(dp, dend);
1162 vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask);
1163 }
1164 }
1165 }
1166 }
1167
1168 /* no edge handling is needed in the loop */
1169 if ((mlib_addr) dp <= (mlib_addr) dend2) {
 /* one iteration per 32 bytes between dp and dend2, inclusive */
1170 n = ((mlib_u8 *)dend2 - (mlib_u8 *)dp) / 32 + 1;
1171#pragma pipeloop(0)
1172 for (i = 0; i < n; i++) {
1173 LOAD_INSERT_S16_34L;
 /* the macro changed the alignment offset; set it from -doff again */
1174 vis_alignaddr((void *)0, -doff);
 /* dd4 holds dd3 of the previous step and supplies the leading part */
1175 vis_pst_16(vis_faligndata(dd4, dd0), dp++, bmask);
1176 vis_pst_16(vis_faligndata(dd0, dd1), dp++, bmask);
1177 vis_pst_16(vis_faligndata(dd1, dd2), dp++, bmask);
1178 vis_pst_16(vis_faligndata(dd2, dd3), dp++, bmask);
1179 }
1180 }
1181
 /* trailing words: up to four edge-masked stores */
1182 if ((mlib_addr) dp <= (mlib_addr) dend) {
1183 LOAD_INSERT_S16_34L;
1184 vis_alignaddr((void *)0, -doff);
1185 emask = vis_edge16(dp, dend);
1186 vis_pst_16(vis_faligndata(dd4, dd0), dp++, emask & bmask);
1187 if ((mlib_addr) dp <= (mlib_addr) dend) {
1188 emask = vis_edge16(dp, dend);
1189 vis_pst_16(vis_faligndata(dd0, dd1), dp++, emask & bmask);
1190 if ((mlib_addr) dp <= (mlib_addr) dend) {
1191 emask = vis_edge16(dp, dend);
1192 vis_pst_16(vis_faligndata(dd1, dd2), dp++, emask & bmask);
1193 if ((mlib_addr) dp <= (mlib_addr) dend) {
1194 emask = vis_edge16(dp, dend);
1195 vis_pst_16(vis_faligndata(dd2, dd3), dp++, emask & bmask);
1196 }
1197 }
1198 }
1199 }
1200 }
1201}
1202
1203/***************************************************************/
1204
1205void
1206mlib_v_ImageChannelInsert_S16_34L(mlib_s16 *src, mlib_s32 slb,
1207 mlib_s16 *dst, mlib_s32 dlb,
1208 mlib_s32 xsize, mlib_s32 ysize)
1209{
1210 mlib_s16 *sa, *da;
1211 mlib_s16 *sl, *dl;
1212 int j;
1213
1214 sa = sl = src;
1215 da = dl = dst;
1216
1217#pragma pipeloop(0)
1218 for (j = 0; j < ysize; j++) {
1219 mlib_v_ImageChannelInsert_S16_34L_D1(sa, da, xsize);
1220 sa = sl = (mlib_s16 *)((mlib_u8 *)sl + slb);
1221 da = dl = (mlib_s16 *)((mlib_u8 *)dl + dlb);
1222 }
1223}
1224
1225/***************************************************************/