/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */

#include "jsimd_altivec.h"


/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
 * always get the data we want by using a single vector load (although we may
 * have to permute the result.)
 */
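/* LOAD_ROW(row) loads the eight samples at sample_data[row] + start_col into
   the variable in<row>.  On big-endian (pure AltiVec) builds, vec_ld()
   ignores the low four bits of the address, so an unaligned load must be
   rotated into place with vec_lvsl()/vec_perm(); on little-endian builds,
   vec_vsx_ld() handles unaligned addresses directly. */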
#if __BIG_ENDIAN__

#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_ld(0, elemptr); \
  if ((size_t)elemptr & 15) \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}

#else

#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_vsx_ld(0, elemptr); \
}

#endif

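/* Convert an 8x8 block of samples (JSAMPLEs) to DCTELEMs, ready for the
   forward DCT. */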
void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
                            DCTELEM *workspace)
{
  JSAMPROW elemptr;

  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  __vector unsigned char pb_zero = { __16X(0) };

  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

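  /* Zero-extend the eight samples of each row from unsigned chars to 16-bit
     words.  (VEC_UNPACKHU is defined in jsimd_altivec.h; it merges each
     input with pb_zero, which is why that constant is declared above.) */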
  out0 = (__vector short)VEC_UNPACKHU(in0);
  out1 = (__vector short)VEC_UNPACKHU(in1);
  out2 = (__vector short)VEC_UNPACKHU(in2);
  out3 = (__vector short)VEC_UNPACKHU(in3);
  out4 = (__vector short)VEC_UNPACKHU(in4);
  out5 = (__vector short)VEC_UNPACKHU(in5);
  out6 = (__vector short)VEC_UNPACKHU(in6);
  out7 = (__vector short)VEC_UNPACKHU(in7);

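  /* Level-shift the samples from their unsigned range to a signed range
     centered on zero by subtracting CENTERJSAMPLE. */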
  out0 = vec_sub(out0, pw_centerjsamp);
  out1 = vec_sub(out1, pw_centerjsamp);
  out2 = vec_sub(out2, pw_centerjsamp);
  out3 = vec_sub(out3, pw_centerjsamp);
  out4 = vec_sub(out4, pw_centerjsamp);
  out5 = vec_sub(out5, pw_centerjsamp);
  out6 = vec_sub(out6, pw_centerjsamp);
  out7 = vec_sub(out7, pw_centerjsamp);

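  /* Store the eight rows contiguously into the 8x8 DCT workspace (16 bytes
     per row of eight 16-bit elements). */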
  vec_st(out0, 0, workspace);
  vec_st(out1, 16, workspace);
  vec_st(out2, 32, workspace);
  vec_st(out3, 48, workspace);
  vec_st(out4, 64, workspace);
  vec_st(out5, 80, workspace);
  vec_st(out6, 96, workspace);
  vec_st(out7, 112, workspace);
}


#define WORD_BIT  16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
   We basically need an unsigned equivalent of vec_madds(). */
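/* vec_mule()/vec_mulo() compute full 32-bit products of the even- and
   odd-numbered 16-bit elements, and shift_pack_index then selects the high
   word of each product, so MULTIPLY(vs0, vs1, out) effectively computes
   out = (vs0 * vs1) >> 16 for all eight elements. */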

#define MULTIPLY(vs0, vs1, out) { \
  tmpe = vec_mule((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  tmpo = vec_mulo((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
                                 (__vector unsigned short)tmpo, \
                                 shift_pack_index); \
}

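/* Quantize an 8x8 block of DCT coefficients.  divisors points to three
   contiguous DCTSIZE2-element arrays (the reciprocals, corrections, and
   scales loaded below), and each element is computed as
   coef = sign(coef) * ((((|coef| + corr) * recip) >> 16) * scale >> 16). */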
void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
                            DCTELEM *workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };
#else
  __vector unsigned char shift_pack_index =
    { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif

  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Branch-less absolute value */
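  /* rowNs = rowN >> 15 is all ones for negative elements and all zeroes
     otherwise; (rowN ^ rowNs) - rowNs then negates exactly the negative
     elements.  The sign vectors are kept so that the signs can be
     reapplied after quantization. */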
  row0s = vec_sra(row0, pw_word_bit_m1);
  row1s = vec_sra(row1, pw_word_bit_m1);
  row2s = vec_sra(row2, pw_word_bit_m1);
  row3s = vec_sra(row3, pw_word_bit_m1);
  row4s = vec_sra(row4, pw_word_bit_m1);
  row5s = vec_sra(row5, pw_word_bit_m1);
  row6s = vec_sra(row6, pw_word_bit_m1);
  row7s = vec_sra(row7, pw_word_bit_m1);
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

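  /* Add the pre-computed correction terms, which bias |coef| so that the
     truncating reciprocal multiplications below round as intended.  (The
     vec_ld() offsets are in bytes; with sizeof(DCTELEM) == 2, an offset of
     DCTSIZE2 * 2 skips one DCTSIZE2-element array.) */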
  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

  row0 = vec_add(row0, corr0);
  row1 = vec_add(row1, corr1);
  row2 = vec_add(row2, corr2);
  row3 = vec_add(row3, corr3);
  row4 = vec_add(row4, corr4);
  row5 = vec_add(row5, corr5);
  row6 = vec_add(row6, corr6);
  row7 = vec_add(row7, corr7);

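  /* Multiply by the 16-bit reciprocals of the quantization step sizes,
     keeping the high word of each 32-bit product. */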
  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

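  /* Multiply by the scale factors, completing the division by the
     quantization step size. */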
  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

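  /* Reapply the original signs (the inverse of the branch-less absolute
     value above). */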
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

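  /* Store the quantized coefficients to the output coefficient block. */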
  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}