/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.
 * All rights reserved.
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
| 22 | |
/* CHROMA UPSAMPLING */
| 24 | |
| 25 | #include "jsimd_altivec.h" |
| 26 | |
| 27 | |
| 28 | void |
| 29 | jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor, |
| 30 | JDIMENSION downsampled_width, |
| 31 | JSAMPARRAY input_data, |
| 32 | JSAMPARRAY *output_data_ptr) |
| 33 | { |
| 34 | JSAMPARRAY output_data = *output_data_ptr; |
| 35 | JSAMPROW inptr, outptr; |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 36 | int inrow, incol; |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 37 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 38 | __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0, |
| 39 | out; |
| 40 | __vector short this0e, this0o, this0l, this0h, last0l, last0h, |
| 41 | next0l, next0h, outle, outhe, outlo, outho; |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 42 | |
| 43 | /* Constants */ |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 44 | __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) }, |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 45 | last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}, |
| 46 | last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30}, |
| 47 | next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, |
| 48 | next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15}, |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 49 | #if __BIG_ENDIAN__ |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 50 | merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31}; |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 51 | #else |
| 52 | merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30}; |
| 53 | #endif |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 54 | __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) }; |
| 55 | |
| 56 | for (inrow = 0; inrow < max_v_samp_factor; inrow++) { |
| 57 | inptr = input_data[inrow]; |
| 58 | outptr = output_data[inrow]; |
| 59 | |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 60 | if (downsampled_width & 15) |
| 61 | inptr[downsampled_width] = inptr[downsampled_width - 1]; |
| 62 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 63 | this0 = vec_ld(0, inptr); |
| 64 | p_last0 = vec_perm(this0, this0, last_index_col0); |
| 65 | last0 = this0; |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 66 | |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 67 | for (incol = downsampled_width; incol > 0; |
| 68 | incol -= 16, inptr += 16, outptr += 32) { |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 69 | |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 70 | if (downsampled_width - incol > 0) { |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 71 | p_last0 = vec_perm(last0, this0, last_index); |
| 72 | last0 = this0; |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 73 | } |
| 74 | |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 75 | if (incol <= 16) |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 76 | p_next0 = vec_perm(this0, this0, next_index_lastcol); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 77 | else { |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 78 | next0 = vec_ld(16, inptr); |
| 79 | p_next0 = vec_perm(this0, next0, next_index); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 80 | } |
| 81 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 82 | this0e = (__vector short)vec_mule(this0, pb_three); |
| 83 | this0o = (__vector short)vec_mulo(this0, pb_three); |
| 84 | this0l = vec_mergeh(this0e, this0o); |
| 85 | this0h = vec_mergel(this0e, this0o); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 86 | |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 87 | last0l = (__vector short)VEC_UNPACKHU(p_last0); |
| 88 | last0h = (__vector short)VEC_UNPACKLU(p_last0); |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 89 | last0l = vec_add(last0l, pw_one); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 90 | |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 91 | next0l = (__vector short)VEC_UNPACKHU(p_next0); |
| 92 | next0h = (__vector short)VEC_UNPACKLU(p_next0); |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 93 | next0l = vec_add(next0l, pw_two); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 94 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 95 | outle = vec_add(this0l, last0l); |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 96 | outlo = vec_add(this0l, next0l); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 97 | outle = vec_sr(outle, (__vector unsigned short)pw_two); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 98 | outlo = vec_sr(outlo, (__vector unsigned short)pw_two); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 99 | |
| 100 | out = vec_perm((__vector unsigned char)outle, |
| 101 | (__vector unsigned char)outlo, merge_pack_index); |
| 102 | vec_st(out, 0, outptr); |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 103 | |
| 104 | if (incol > 8) { |
| 105 | last0h = vec_add(last0h, pw_one); |
| 106 | next0h = vec_add(next0h, pw_two); |
| 107 | |
| 108 | outhe = vec_add(this0h, last0h); |
| 109 | outho = vec_add(this0h, next0h); |
| 110 | outhe = vec_sr(outhe, (__vector unsigned short)pw_two); |
| 111 | outho = vec_sr(outho, (__vector unsigned short)pw_two); |
| 112 | |
| 113 | out = vec_perm((__vector unsigned char)outhe, |
| 114 | (__vector unsigned char)outho, merge_pack_index); |
| 115 | vec_st(out, 16, outptr); |
| 116 | } |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 117 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 118 | this0 = next0; |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 119 | } |
| 120 | } |
| 121 | } |
| 122 | |
| 123 | |
| 124 | void |
| 125 | jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor, |
| 126 | JDIMENSION downsampled_width, |
| 127 | JSAMPARRAY input_data, |
| 128 | JSAMPARRAY *output_data_ptr) |
| 129 | { |
| 130 | JSAMPARRAY output_data = *output_data_ptr; |
| 131 | JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 132 | int inrow, outrow, incol; |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 133 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 134 | __vector unsigned char this_1, this0, this1, out; |
| 135 | __vector short this_1l, this_1h, this0l, this0h, this1l, this1h, |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 136 | lastcolsum_1h, lastcolsum1h, |
| 137 | p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, |
| 138 | thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, |
| 139 | nextcolsum_1l = {0}, nextcolsum_1h = {0}, |
| 140 | nextcolsum1l = {0}, nextcolsum1h = {0}, |
| 141 | p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h, |
| 142 | tmpl, tmph, outle, outhe, outlo, outho; |
| 143 | |
| 144 | /* Constants */ |
| 145 | __vector unsigned char pb_zero = { __16X(0) }, |
| 146 | last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, |
| 147 | last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29}, |
| 148 | next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17}, |
| 149 | next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15}, |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 150 | #if __BIG_ENDIAN__ |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 151 | merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31}; |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 152 | #else |
| 153 | merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30}; |
| 154 | #endif |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 155 | __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) }, |
| 156 | pw_seven = { __8X(7) }, pw_eight = { __8X(8) }; |
| 157 | __vector unsigned short pw_four = { __8X(4) }; |
| 158 | |
| 159 | for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { |
| 160 | |
| 161 | inptr_1 = input_data[inrow - 1]; |
| 162 | inptr0 = input_data[inrow]; |
| 163 | inptr1 = input_data[inrow + 1]; |
| 164 | outptr0 = output_data[outrow++]; |
| 165 | outptr1 = output_data[outrow++]; |
| 166 | |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 167 | if (downsampled_width & 15) { |
| 168 | inptr_1[downsampled_width] = inptr_1[downsampled_width - 1]; |
| 169 | inptr0[downsampled_width] = inptr0[downsampled_width - 1]; |
| 170 | inptr1[downsampled_width] = inptr1[downsampled_width - 1]; |
| 171 | } |
| 172 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 173 | this0 = vec_ld(0, inptr0); |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 174 | this0l = (__vector short)VEC_UNPACKHU(this0); |
| 175 | this0h = (__vector short)VEC_UNPACKLU(this0); |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 176 | this0l = vec_mladd(this0l, pw_three, pw_zero); |
| 177 | this0h = vec_mladd(this0h, pw_three, pw_zero); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 178 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 179 | this_1 = vec_ld(0, inptr_1); |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 180 | this_1l = (__vector short)VEC_UNPACKHU(this_1); |
| 181 | this_1h = (__vector short)VEC_UNPACKLU(this_1); |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 182 | thiscolsum_1l = vec_add(this0l, this_1l); |
| 183 | thiscolsum_1h = vec_add(this0h, this_1h); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 184 | lastcolsum_1h = thiscolsum_1h; |
| 185 | p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0); |
| 186 | p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); |
| 187 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 188 | this1 = vec_ld(0, inptr1); |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 189 | this1l = (__vector short)VEC_UNPACKHU(this1); |
| 190 | this1h = (__vector short)VEC_UNPACKLU(this1); |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 191 | thiscolsum1l = vec_add(this0l, this1l); |
| 192 | thiscolsum1h = vec_add(this0h, this1h); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 193 | lastcolsum1h = thiscolsum1h; |
| 194 | p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0); |
| 195 | p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); |
| 196 | |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 197 | for (incol = downsampled_width; incol > 0; |
| 198 | incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16, |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 199 | outptr0 += 32, outptr1 += 32) { |
| 200 | |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 201 | if (downsampled_width - incol > 0) { |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 202 | p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index); |
| 203 | p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); |
| 204 | p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index); |
| 205 | p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); |
| 206 | lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h; |
| 207 | } |
| 208 | |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 209 | if (incol <= 16) { |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 210 | p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); |
| 211 | p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h, |
| 212 | next_index_lastcol); |
| 213 | p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); |
| 214 | p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h, |
| 215 | next_index_lastcol); |
| 216 | } else { |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 217 | this0 = vec_ld(16, inptr0); |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 218 | this0l = (__vector short)VEC_UNPACKHU(this0); |
| 219 | this0h = (__vector short)VEC_UNPACKLU(this0); |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 220 | this0l = vec_mladd(this0l, pw_three, pw_zero); |
| 221 | this0h = vec_mladd(this0h, pw_three, pw_zero); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 222 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 223 | this_1 = vec_ld(16, inptr_1); |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 224 | this_1l = (__vector short)VEC_UNPACKHU(this_1); |
| 225 | this_1h = (__vector short)VEC_UNPACKLU(this_1); |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 226 | nextcolsum_1l = vec_add(this0l, this_1l); |
| 227 | nextcolsum_1h = vec_add(this0h, this_1h); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 228 | p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); |
| 229 | p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index); |
| 230 | |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 231 | this1 = vec_ld(16, inptr1); |
DRC | 771ab19 | 2015-02-20 19:57:21 +0000 | [diff] [blame] | 232 | this1l = (__vector short)VEC_UNPACKHU(this1); |
| 233 | this1h = (__vector short)VEC_UNPACKLU(this1); |
DRC | a6a24c2 | 2015-01-13 10:00:12 +0000 | [diff] [blame] | 234 | nextcolsum1l = vec_add(this0l, this1l); |
| 235 | nextcolsum1h = vec_add(this0h, this1h); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 236 | p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); |
| 237 | p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index); |
| 238 | } |
| 239 | |
| 240 | /* Process the upper row */ |
| 241 | |
| 242 | tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 243 | outle = vec_add(tmpl, p_lastcolsum_1l); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 244 | outle = vec_add(outle, pw_eight); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 245 | outle = vec_sr(outle, pw_four); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 246 | |
| 247 | outlo = vec_add(tmpl, p_nextcolsum_1l); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 248 | outlo = vec_add(outlo, pw_seven); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 249 | outlo = vec_sr(outlo, pw_four); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 250 | |
| 251 | out = vec_perm((__vector unsigned char)outle, |
| 252 | (__vector unsigned char)outlo, merge_pack_index); |
| 253 | vec_st(out, 0, outptr0); |
| 254 | |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 255 | if (incol > 8) { |
| 256 | tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero); |
| 257 | outhe = vec_add(tmph, p_lastcolsum_1h); |
| 258 | outhe = vec_add(outhe, pw_eight); |
| 259 | outhe = vec_sr(outhe, pw_four); |
| 260 | |
| 261 | outho = vec_add(tmph, p_nextcolsum_1h); |
| 262 | outho = vec_add(outho, pw_seven); |
| 263 | outho = vec_sr(outho, pw_four); |
| 264 | |
| 265 | out = vec_perm((__vector unsigned char)outhe, |
| 266 | (__vector unsigned char)outho, merge_pack_index); |
| 267 | vec_st(out, 16, outptr0); |
| 268 | } |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 269 | |
| 270 | /* Process the lower row */ |
| 271 | |
| 272 | tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 273 | outle = vec_add(tmpl, p_lastcolsum1l); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 274 | outle = vec_add(outle, pw_eight); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 275 | outle = vec_sr(outle, pw_four); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 276 | |
| 277 | outlo = vec_add(tmpl, p_nextcolsum1l); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 278 | outlo = vec_add(outlo, pw_seven); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 279 | outlo = vec_sr(outlo, pw_four); |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 280 | |
| 281 | out = vec_perm((__vector unsigned char)outle, |
| 282 | (__vector unsigned char)outlo, merge_pack_index); |
| 283 | vec_st(out, 0, outptr1); |
DRC | 2517ef7 | 2015-01-14 10:45:31 +0000 | [diff] [blame] | 284 | |
| 285 | if (incol > 8) { |
| 286 | tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero); |
| 287 | outhe = vec_add(tmph, p_lastcolsum1h); |
| 288 | outhe = vec_add(outhe, pw_eight); |
| 289 | outhe = vec_sr(outhe, pw_four); |
| 290 | |
| 291 | outho = vec_add(tmph, p_nextcolsum1h); |
| 292 | outho = vec_add(outho, pw_seven); |
| 293 | outho = vec_sr(outho, pw_four); |
| 294 | |
| 295 | out = vec_perm((__vector unsigned char)outhe, |
| 296 | (__vector unsigned char)outho, merge_pack_index); |
| 297 | vec_st(out, 16, outptr1); |
| 298 | } |
DRC | 52a4ec6 | 2015-01-13 09:02:29 +0000 | [diff] [blame] | 299 | |
| 300 | thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; |
| 301 | thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; |
| 302 | } |
| 303 | } |
| 304 | } |
DRC | c641cdd | 2015-01-14 15:41:11 +0000 | [diff] [blame] | 305 | |
| 306 | |
/* These are rarely used (mainly just for decompressing YCCK images) */
| 308 | |
| 309 | void |
| 310 | jsimd_h2v1_upsample_altivec (int max_v_samp_factor, |
| 311 | JDIMENSION output_width, |
| 312 | JSAMPARRAY input_data, |
DRC | bd49803 | 2016-02-19 08:53:33 -0600 | [diff] [blame] | 313 | JSAMPARRAY *output_data_ptr) |
DRC | c641cdd | 2015-01-14 15:41:11 +0000 | [diff] [blame] | 314 | { |
| 315 | JSAMPARRAY output_data = *output_data_ptr; |
| 316 | JSAMPROW inptr, outptr; |
| 317 | int inrow, incol; |
| 318 | |
| 319 | __vector unsigned char in, inl, inh; |
| 320 | |
| 321 | for (inrow = 0; inrow < max_v_samp_factor; inrow++) { |
| 322 | inptr = input_data[inrow]; |
| 323 | outptr = output_data[inrow]; |
| 324 | |
| 325 | for (incol = (output_width + 31) & (~31); incol > 0; |
| 326 | incol -= 64, inptr += 32, outptr += 64) { |
| 327 | |
| 328 | in = vec_ld(0, inptr); |
| 329 | inl = vec_mergeh(in, in); |
| 330 | inh = vec_mergel(in, in); |
| 331 | |
| 332 | vec_st(inl, 0, outptr); |
| 333 | vec_st(inh, 16, outptr); |
| 334 | |
| 335 | if (incol > 32) { |
| 336 | in = vec_ld(16, inptr); |
| 337 | inl = vec_mergeh(in, in); |
| 338 | inh = vec_mergel(in, in); |
| 339 | |
| 340 | vec_st(inl, 32, outptr); |
| 341 | vec_st(inh, 48, outptr); |
| 342 | } |
| 343 | } |
| 344 | } |
| 345 | } |
| 346 | |
| 347 | |
| 348 | void |
| 349 | jsimd_h2v2_upsample_altivec (int max_v_samp_factor, |
| 350 | JDIMENSION output_width, |
| 351 | JSAMPARRAY input_data, |
DRC | bd49803 | 2016-02-19 08:53:33 -0600 | [diff] [blame] | 352 | JSAMPARRAY *output_data_ptr) |
DRC | c641cdd | 2015-01-14 15:41:11 +0000 | [diff] [blame] | 353 | { |
| 354 | JSAMPARRAY output_data = *output_data_ptr; |
| 355 | JSAMPROW inptr, outptr0, outptr1; |
| 356 | int inrow, outrow, incol; |
| 357 | |
| 358 | __vector unsigned char in, inl, inh; |
| 359 | |
| 360 | for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { |
| 361 | |
| 362 | inptr = input_data[inrow]; |
| 363 | outptr0 = output_data[outrow++]; |
| 364 | outptr1 = output_data[outrow++]; |
| 365 | |
| 366 | for (incol = (output_width + 31) & (~31); incol > 0; |
| 367 | incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) { |
| 368 | |
| 369 | in = vec_ld(0, inptr); |
| 370 | inl = vec_mergeh(in, in); |
| 371 | inh = vec_mergel(in, in); |
| 372 | |
| 373 | vec_st(inl, 0, outptr0); |
| 374 | vec_st(inl, 0, outptr1); |
| 375 | |
| 376 | vec_st(inh, 16, outptr0); |
| 377 | vec_st(inh, 16, outptr1); |
| 378 | |
| 379 | if (incol > 32) { |
| 380 | in = vec_ld(16, inptr); |
| 381 | inl = vec_mergeh(in, in); |
| 382 | inh = vec_mergel(in, in); |
| 383 | |
| 384 | vec_st(inl, 32, outptr0); |
| 385 | vec_st(inl, 32, outptr1); |
| 386 | |
| 387 | vec_st(inh, 48, outptr0); |
| 388 | vec_st(inh, 48, outptr1); |
| 389 | } |
| 390 | } |
| 391 | } |
| 392 | } |