| /* |
| * Copyright (C) 2011 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <stdint.h> |
| #include <x86intrin.h> |
| |
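| /* |
| * x86 SSE implementations of the rsdIntrinsic* CPU kernels: convolution, |
| * color matrix, blur, YUV-to-RGBA conversion and pixel blending. Pixel data |
| * is assumed to be 8-bit RGBA unless noted otherwise. |
| */ |
| |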
| /* Zero-extend the four packed 8-bit integers in the low 32 bits (LSBs) of 'x' into packed 32-bit integers */ |
| static inline __m128i cvtepu8_epi32(__m128i x) { |
| #if defined(__SSE4_1__) |
| return _mm_cvtepu8_epi32(x); |
| #elif defined(__SSSE3__) |
| const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00); |
| x = _mm_shuffle_epi8(x, M8to32); |
| return x; |
| #else |
| # error "Require at least SSSE3" |
| #endif |
| } |
| |
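| /* Pack eight signed 32-bit integers from 'lo' and 'hi' into unsigned 16-bit |
| * integers with saturation; emulates _mm_packus_epi32 when only SSSE3 is |
| * available. */ |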
| static inline __m128i packus_epi32(__m128i lo, __m128i hi) { |
| #if defined(__SSE4_1__) |
| return _mm_packus_epi32(lo, hi); |
| #elif defined(__SSSE3__) |
| const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000); |
| const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff); |
| const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100); |
| const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff); |
| lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0)); |
| lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1)); |
| hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0)); |
| hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1)); |
| return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L), |
| _mm_shuffle_epi8(hi, M32to16H)); |
| #else |
| # error "Require at least SSSE3" |
| #endif |
| } |
| |
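| /* Multiply packed 32-bit integers and keep the low 32 bits of each product; |
| * emulates _mm_mullo_epi32 when only SSSE3 is available. */ |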
| static inline __m128i mullo_epi32(__m128i x, __m128i y) { |
| #if defined(__SSE4_1__) |
| return _mm_mullo_epi32(x, y); |
| #elif defined(__SSSE3__) |
| const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff); |
| __m128i even = _mm_mul_epu32(x, y); |
| __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4), |
| _mm_srli_si128(y, 4)); |
| even = _mm_and_si128(even, Meven); |
| odd = _mm_and_si128(odd, Meven); |
| return _mm_or_si128(even, _mm_slli_si128(odd, 4)); |
| #else |
| # error "Require at least SSSE3" |
| #endif |
| } |
| |
| /* 'mask' must hold packed 8-bit values, each either 0x00 or 0xff */ |
| static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) { |
| #if defined(__SSE4_1__) |
| return _mm_blendv_epi8(x, y, mask); |
| #elif defined(__SSSE3__) |
| return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask)); |
| #else |
| # error "Require at least SSSE3" |
| #endif |
| } |
| |
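| /* 3x3 convolution over RGBA_8888 pixels. y0/y1/y2 point at the three source |
| * rows and 'coef' holds nine 16-bit coefficients that appear to be in Q8 |
| * fixed point (the accumulated sums are shifted right by 8). Each iteration |
| * produces two output pixels (8 bytes), so 'count' is presumably the number |
| * of 2-pixel groups. */ |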
| extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, |
| const void *y1, const void *y2, |
| const short *coef, uint32_t count) { |
| __m128i x; |
| __m128i c0, c2, c4, c6, c8; |
| __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11; |
| __m128i o0, o1; |
| uint32_t i; |
| |
| x = _mm_loadl_epi64((const __m128i *)(coef+0)); |
| c0 = _mm_shuffle_epi32(x, 0x00); |
| c2 = _mm_shuffle_epi32(x, 0x55); |
| x = _mm_loadl_epi64((const __m128i *)(coef+4)); |
| c4 = _mm_shuffle_epi32(x, 0x00); |
| c6 = _mm_shuffle_epi32(x, 0x55); |
| x = _mm_loadl_epi64((const __m128i *)(coef+8)); |
| c8 = _mm_shuffle_epi32(x, 0x00); |
| |
| for (i = 0; i < count; ++i) { |
| |
| p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128()); |
| p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128()); |
| p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128()); |
| p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128()); |
| p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128()); |
| p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128()); |
| p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128()); |
| p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128()); |
| p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128()); |
| p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128()); |
| p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128()); |
| p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128()); |
| |
| o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0); |
| o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0); |
| |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2)); |
| |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4)); |
| |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6)); |
| |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8)); |
| |
| o0 = _mm_srai_epi32(o0, 8); |
| o1 = _mm_srai_epi32(o1, 8); |
| |
| o0 = packus_epi32(o0, o1); |
| o0 = _mm_packus_epi16(o0, o0); |
| _mm_storel_epi64((__m128i *)dst, o0); |
| |
| y0 = (const char *)y0 + 8; |
| y1 = (const char *)y1 + 8; |
| y2 = (const char *)y2 + 8; |
| dst = (char *)dst + 8; |
| } |
| } |
| |
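| /* Multiply each RGBA_8888 pixel by a 4x4 color matrix. 'coef' holds sixteen |
| * 16-bit coefficients that appear to be in Q8 fixed point; each output |
| * channel is a dot product of the input channels with one line of the |
| * matrix, shifted right by 8. Four pixels (16 bytes) are processed per |
| * iteration. */ |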
| void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, |
| const short *coef, uint32_t count) { |
| const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, |
| 14, 10, 6, 2, |
| 13, 9, 5, 1, |
| 12, 8, 4, 0); |
| |
| const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00); |
| const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02); |
| __m128i c0, c1, c2, c3; |
| __m128i i4, o4; |
| __m128i xy, zw; |
| __m128i x2, y2, z2, w2; |
| uint32_t i; |
| |
| c0 = _mm_loadl_epi64((const __m128i *)(coef+0)); |
| c1 = _mm_loadl_epi64((const __m128i *)(coef+4)); |
| c0 = _mm_unpacklo_epi16(c0, c1); |
| |
| c2 = _mm_loadl_epi64((const __m128i *)(coef+8)); |
| c3 = _mm_loadl_epi64((const __m128i *)(coef+12)); |
| c2 = _mm_unpacklo_epi16(c2, c3); |
| |
| for (i = 0; i < count; ++i) { |
| i4 = _mm_loadu_si128((const __m128i *)src); |
| xy = _mm_shuffle_epi8(i4, Mxy); |
| zw = _mm_shuffle_epi8(i4, Mzw); |
| |
| x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00)); |
| y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55)); |
| z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa)); |
| w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff)); |
| |
| x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00))); |
| y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55))); |
| z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa))); |
| w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff))); |
| |
| x2 = _mm_srai_epi32(x2, 8); |
| y2 = _mm_srai_epi32(y2, 8); |
| z2 = _mm_srai_epi32(z2, 8); |
| w2 = _mm_srai_epi32(w2, 8); |
| |
| x2 = packus_epi32(x2, y2); |
| z2 = packus_epi32(z2, w2); |
| o4 = _mm_packus_epi16(x2, z2); |
| |
| o4 = _mm_shuffle_epi8(o4, T4x4); |
| _mm_storeu_si128((__m128i *)dst, o4); |
| |
| src = (const char *)src + 16; |
| dst = (char *)dst + 16; |
| } |
| } |
| |
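| /* Same as the 4x4 variant above, but only the R, G and B outputs are |
| * computed from the matrix; the alpha channel is passed through from the |
| * source pixel. */ |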
| void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, |
| const short *coef, uint32_t count) { |
| const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, |
| 14, 10, 6, 2, |
| 13, 9, 5, 1, |
| 12, 8, 4, 0); |
| |
| const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00); |
| const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02); |
| |
| __m128i c0, c1, c2, c3; |
| __m128i i4, o4; |
| __m128i xy, zw; |
| __m128i x2, y2, z2, w2; |
| uint32_t i; |
| |
| c0 = _mm_loadl_epi64((const __m128i *)(coef+0)); |
| c1 = _mm_loadl_epi64((const __m128i *)(coef+4)); |
| c0 = _mm_unpacklo_epi16(c0, c1); |
| |
| c2 = _mm_loadl_epi64((const __m128i *)(coef+8)); |
| c3 = _mm_loadl_epi64((const __m128i *)(coef+12)); |
| c2 = _mm_unpacklo_epi16(c2, c3); |
| |
| for (i = 0; i < count; ++i) { |
| i4 = _mm_loadu_si128((const __m128i *)src); |
| xy = _mm_shuffle_epi8(i4, Mxy); |
| zw = _mm_shuffle_epi8(i4, Mzw); |
| |
| x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00)); |
| y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55)); |
| z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa)); |
| |
| x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00))); |
| y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55))); |
| z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa))); |
| |
| x2 = _mm_srai_epi32(x2, 8); |
| y2 = _mm_srai_epi32(y2, 8); |
| z2 = _mm_srai_epi32(z2, 8); |
| w2 = _mm_srli_epi32(zw, 16); |
| |
| x2 = packus_epi32(x2, y2); |
| z2 = packus_epi32(z2, w2); |
| o4 = _mm_packus_epi16(x2, z2); |
| |
| o4 = _mm_shuffle_epi8(o4, T4x4); |
| _mm_storeu_si128((__m128i *)dst, o4); |
| |
| src = (const char *)src + 16; |
| dst = (char *)dst + 16; |
| } |
| } |
| |
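| /* Dot-product variant of the color matrix kernel: a single coefficient row |
| * (broadcast from 'coef') is applied to each pixel and the result is |
| * replicated to R, G and B; alpha is passed through from the source. */ |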
| void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, |
| const short *coef, uint32_t count) { |
| const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, |
| 14, 10, 6, 2, |
| 13, 9, 5, 1, |
| 12, 8, 4, 0); |
| const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00); |
| const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02); |
| __m128i c0, c1, c2, c3; |
| __m128i i4, o4; |
| __m128i xy, zw; |
| __m128i x2, y2, z2, w2; |
| uint32_t i; |
| |
| c0 = _mm_loadl_epi64((const __m128i *)(coef+0)); |
| c0 = _mm_shufflelo_epi16(c0, 0); |
| c1 = _mm_loadl_epi64((const __m128i *)(coef+4)); |
| c1 = _mm_shufflelo_epi16(c1, 0); |
| c0 = _mm_unpacklo_epi16(c0, c1); |
| |
| c2 = _mm_loadl_epi64((const __m128i *)(coef+8)); |
| c2 = _mm_shufflelo_epi16(c2, 0); |
| c3 = _mm_loadl_epi64((const __m128i *)(coef+12)); |
| c3 = _mm_shufflelo_epi16(c3, 0); |
| c2 = _mm_unpacklo_epi16(c2, c3); |
| |
| for (i = 0; i < count; ++i) { |
| i4 = _mm_loadu_si128((const __m128i *)src); |
| |
| xy = _mm_shuffle_epi8(i4, Mxy); |
| zw = _mm_shuffle_epi8(i4, Mzw); |
| |
| x2 = _mm_madd_epi16(xy, c0); |
| x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2)); |
| |
| x2 = _mm_srai_epi32(x2, 8); |
| y2 = x2; |
| z2 = x2; |
| w2 = _mm_srli_epi32(zw, 16); |
| |
| x2 = packus_epi32(x2, y2); |
| z2 = packus_epi32(z2, w2); |
| o4 = _mm_packus_epi16(x2, z2); |
| |
| o4 = _mm_shuffle_epi8(o4, T4x4); |
| _mm_storeu_si128((__m128i *)dst, o4); |
| |
| src = (const char *)src + 16; |
| dst = (char *)dst + 16; |
| } |
| } |
| |
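| /* Vertical blur pass: for each pair of RGBA pixels in [x1, x2), accumulate |
| * 'rct' source rows spaced 'stride' bytes apart, each weighted by the |
| * corresponding float kernel value at 'gptr', and store the sums as two |
| * unpacked float4 values (32 bytes per pixel pair). */ |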
| void rsdIntrinsicBlurVFU4_K(void *dst, |
| const void *pin, int stride, const void *gptr, |
| int rct, int x1, int x2) { |
| const char *pi; |
| __m128i pi0, pi1; |
| __m128 pf0, pf1; |
| __m128 bp0, bp1; |
| __m128 x; |
| int r; |
| |
| for (; x1 < x2; x1 += 2) { |
| pi = (const char *)pin + (x1 << 2); |
| bp0 = _mm_setzero_ps(); |
| bp1 = _mm_setzero_ps(); |
| |
| for (r = 0; r < rct; ++r) { |
| x = _mm_load_ss((const float *)gptr + r); |
| x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)); |
| |
| pi0 = _mm_cvtsi32_si128(*(const int *)pi); |
| pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1)); |
| |
| pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0)); |
| pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1)); |
| |
| bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x)); |
| bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x)); |
| |
| pi += stride; |
| } |
| |
| _mm_storeu_ps((float *)dst, bp0); |
| _mm_storeu_ps((float *)dst + 4, bp1); |
| dst = (char *)dst + 32; |
| } |
| } |
| |
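| /* Horizontal blur pass over the float4 intermediate produced by the vertical |
| * pass: accumulate 'rct' kernel taps per output pixel (unrolled two at a |
| * time after the first) and convert the float4 sum back to a packed RGBA |
| * pixel. */ |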
| void rsdIntrinsicBlurHFU4_K(void *dst, |
| const void *pin, const void *gptr, |
| int rct, int x1, int x2) { |
| const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400); |
| const float *pi; |
| __m128 pf, x, y; |
| __m128i o; |
| int r; |
| |
| for (; x1 < x2; ++x1) { |
| /* rct is defined as 2*r+1 by the caller */ |
| x = _mm_load_ss((const float *)gptr); |
| x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)); |
| |
| pi = (const float *)pin + (x1 << 2); |
| pf = _mm_mul_ps(x, _mm_load_ps(pi)); |
| |
| for (r = 1; r < rct; r += 2) { |
| x = _mm_load_ss((const float *)gptr + r); |
| y = _mm_load_ss((const float *)gptr + r + 1); |
| x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)); |
| y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0)); |
| |
| pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2)))); |
| pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4))); |
| } |
| |
| o = _mm_cvtps_epi32(pf); |
| *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8)); |
| dst = (char *)dst + 4; |
| } |
| } |
| |
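| /* Horizontal blur pass for single-channel float data: four adjacent outputs |
| * are computed per iteration, with _mm_alignr_epi8 shifting the neighbouring |
| * loads into place so each successive tap sees the window advanced by one |
| * float. */ |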
| void rsdIntrinsicBlurHFU1_K(void *dst, |
| const void *pin, const void *gptr, |
| int rct, int x1, int x2) { |
| const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400); |
| const float *pi; |
| __m128 pf, g0, g1, g2, g3, gx, p0, p1; |
| __m128i o; |
| int r; |
| |
| for (; x1 < x2; x1+=4) { |
| g0 = _mm_load_ss((const float *)gptr); |
| g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0)); |
| |
| pi = (const float *)pin + x1; |
| pf = _mm_mul_ps(g0, _mm_loadu_ps(pi)); |
| |
| for (r = 1; r < rct; r += 4) { |
| gx = _mm_loadu_ps((const float *)gptr + r); |
| p0 = _mm_loadu_ps(pi + r); |
| p1 = _mm_loadu_ps(pi + r + 4); |
| |
| g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0)); |
| pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0)); |
| g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1)); |
| pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4)))); |
| g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2)); |
| pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8)))); |
| g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3)); |
| pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12)))); |
| } |
| |
| o = _mm_cvtps_epi32(pf); |
| *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8)); |
| dst = (char *)dst + 4; |
| } |
| } |
| |
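| /* YUV to RGBA conversion for an interleaved chroma plane: 'pUV' is read as |
| * byte pairs with V in the even bytes and U in the odd bytes (NV21-style |
| * ordering), and each pair is shared by two adjacent Y samples. 'param' |
| * supplies the integer conversion coefficients (the comments below show the |
| * usual BT.601 values); results are (value + 128) >> 8 with alpha forced to |
| * 255, four pixels per iteration. */ |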
| void rsdIntrinsicYuv_K(void *dst, |
| const unsigned char *pY, const unsigned char *pUV, |
| uint32_t count, const short *param) { |
| __m128i biasY, biasUV; |
| __m128i c0, c1, c2, c3, c4; |
| |
| biasY = _mm_set1_epi32(param[8]); /* 16 */ |
| biasUV = _mm_set1_epi32(param[16]); /* 128 */ |
| |
| c0 = _mm_set1_epi32(param[0]); /* 298 */ |
| c1 = _mm_set1_epi32(param[1]); /* 409 */ |
| c2 = _mm_set1_epi32(param[2]); /* -100 */ |
| c3 = _mm_set1_epi32(param[3]); /* 516 */ |
| c4 = _mm_set1_epi32(param[4]); /* -208 */ |
| |
| __m128i Y, UV, U, V, R, G, B, A; |
| |
| A = _mm_set1_epi32(255); |
| uint32_t i; |
| |
| for (i = 0; i < (count << 1); ++i) { |
| Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY)); |
| UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV)); |
| |
| Y = _mm_sub_epi32(Y, biasY); |
| UV = _mm_sub_epi32(UV, biasUV); |
| |
| U = _mm_shuffle_epi32(UV, 0xf5); |
| V = _mm_shuffle_epi32(UV, 0xa0); |
| |
| Y = mullo_epi32(Y, c0); |
| |
| R = _mm_add_epi32(Y, mullo_epi32(V, c1)); |
| R = _mm_add_epi32(R, biasUV); |
| R = _mm_srai_epi32(R, 8); |
| |
| G = _mm_add_epi32(Y, mullo_epi32(U, c2)); |
| G = _mm_add_epi32(G, mullo_epi32(V, c4)); |
| G = _mm_add_epi32(G, biasUV); |
| G = _mm_srai_epi32(G, 8); |
| |
| B = _mm_add_epi32(Y, mullo_epi32(U, c3)); |
| B = _mm_add_epi32(B, biasUV); |
| B = _mm_srai_epi32(B, 8); |
| |
| __m128i y1, y2, y3, y4; |
| |
| y1 = packus_epi32(R, G); |
| y2 = packus_epi32(B, A); |
| y3 = _mm_packus_epi16(y1, y2); |
| const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, |
| 14, 10, 6, 2, |
| 13, 9, 5, 1, |
| 12, 8, 4, 0); |
| y4 = _mm_shuffle_epi8(y3, T4x4); |
| _mm_storeu_si128((__m128i *)dst, y4); |
| pY += 4; |
| pUV += 4; |
| dst = (__m128i *)dst + 1; |
| } |
| } |
| |
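| /* Same conversion as rsdIntrinsicYuv_K but with the opposite chroma byte |
| * order: U is taken from the even bytes and V from the odd bytes of 'pUV' |
| * (NV12-style ordering). */ |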
| void rsdIntrinsicYuvR_K(void *dst, |
| const unsigned char *pY, const unsigned char *pUV, |
| uint32_t count, const short *param) { |
| __m128i biasY, biasUV; |
| __m128i c0, c1, c2, c3, c4; |
| |
| biasY = _mm_set1_epi32(param[8]); /* 16 */ |
| biasUV = _mm_set1_epi32(param[16]); /* 128 */ |
| |
| c0 = _mm_set1_epi32(param[0]); /* 298 */ |
| c1 = _mm_set1_epi32(param[1]); /* 409 */ |
| c2 = _mm_set1_epi32(param[2]); /* -100 */ |
| c3 = _mm_set1_epi32(param[3]); /* 516 */ |
| c4 = _mm_set1_epi32(param[4]); /* -208 */ |
| |
| __m128i Y, UV, U, V, R, G, B, A; |
| |
| A = _mm_set1_epi32(255); |
| uint32_t i; |
| |
| for (i = 0; i < (count << 1); ++i) { |
| Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY)); |
| UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV)); |
| |
| Y = _mm_sub_epi32(Y, biasY); |
| UV = _mm_sub_epi32(UV, biasUV); |
| |
| V = _mm_shuffle_epi32(UV, 0xf5); |
| U = _mm_shuffle_epi32(UV, 0xa0); |
| |
| Y = mullo_epi32(Y, c0); |
| |
| R = _mm_add_epi32(Y, mullo_epi32(V, c1)); |
| R = _mm_add_epi32(R, biasUV); |
| R = _mm_srai_epi32(R, 8); |
| |
| G = _mm_add_epi32(Y, mullo_epi32(U, c2)); |
| G = _mm_add_epi32(G, mullo_epi32(V, c4)); |
| G = _mm_add_epi32(G, biasUV); |
| G = _mm_srai_epi32(G, 8); |
| |
| B = _mm_add_epi32(Y, mullo_epi32(U, c3)); |
| B = _mm_add_epi32(B, biasUV); |
| B = _mm_srai_epi32(B, 8); |
| |
| __m128i y1, y2, y3, y4; |
| |
| y1 = packus_epi32(R, G); |
| y2 = packus_epi32(B, A); |
| y3 = _mm_packus_epi16(y1, y2); |
| const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, |
| 14, 10, 6, 2, |
| 13, 9, 5, 1, |
| 12, 8, 4, 0); |
| y4 = _mm_shuffle_epi8(y3, T4x4); |
| _mm_storeu_si128((__m128i *)dst, y4); |
| pY += 4; |
| pUV += 4; |
| dst = (__m128i *)dst + 1; |
| } |
| } |
| |
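| /* Same conversion again, but with separate (planar) U and V pointers; as |
| * written, one U and one V byte are consumed per output pixel. */ |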
| void rsdIntrinsicYuv2_K(void *dst, |
| const unsigned char *pY, const unsigned char *pU, |
| const unsigned char *pV, uint32_t count, const short *param) { |
| __m128i biasY, biasUV; |
| __m128i c0, c1, c2, c3, c4; |
| |
| biasY = _mm_set1_epi32(param[8]); /* 16 */ |
| biasUV = _mm_set1_epi32(param[16]); /* 128 */ |
| |
| c0 = _mm_set1_epi32(param[0]); /* 298 */ |
| c1 = _mm_set1_epi32(param[1]); /* 409 */ |
| c2 = _mm_set1_epi32(param[2]); /* -100 */ |
| c3 = _mm_set1_epi32(param[3]); /* 516 */ |
| c4 = _mm_set1_epi32(param[4]); /* -208 */ |
| |
| __m128i Y, U, V, R, G, B, A; |
| |
| A = _mm_set1_epi32(255); |
| uint32_t i; |
| |
| for (i = 0; i < (count << 1); ++i) { |
| Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY)); |
| U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU)); |
| V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV)); |
| |
| Y = _mm_sub_epi32(Y, biasY); |
| U = _mm_sub_epi32(U, biasUV); |
| V = _mm_sub_epi32(V, biasUV); |
| |
| Y = mullo_epi32(Y, c0); |
| |
| R = _mm_add_epi32(Y, mullo_epi32(V, c1)); |
| R = _mm_add_epi32(R, biasUV); |
| R = _mm_srai_epi32(R, 8); |
| |
| G = _mm_add_epi32(Y, mullo_epi32(U, c2)); |
| G = _mm_add_epi32(G, mullo_epi32(V, c4)); |
| G = _mm_add_epi32(G, biasUV); |
| G = _mm_srai_epi32(G, 8); |
| |
| B = _mm_add_epi32(Y, mullo_epi32(U, c3)); |
| B = _mm_add_epi32(B, biasUV); |
| B = _mm_srai_epi32(B, 8); |
| |
| __m128i y1, y2, y3, y4; |
| |
| y1 = packus_epi32(R, G); |
| y2 = packus_epi32(B, A); |
| y3 = _mm_packus_epi16(y1, y2); |
| const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, |
| 14, 10, 6, 2, |
| 13, 9, 5, 1, |
| 12, 8, 4, 0); |
| y4 = _mm_shuffle_epi8(y3, T4x4); |
| _mm_storeu_si128((__m128i *)dst, y4); |
| pY += 4; |
| pU += 4; |
| pV += 4; |
| dst = (__m128i *)dst + 1; |
| } |
| } |
| |
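| /* 5x5 convolution over RGBA_8888 pixels. y0..y4 point at the five source |
| * rows and 'coef' holds twenty-five 16-bit coefficients, again apparently in |
| * Q8 fixed point. Each iteration reads eight pixels from every row and |
| * produces four output pixels (16 bytes). */ |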
| extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, |
| const void *y1, const void *y2, |
| const void *y3, const void *y4, |
| const short *coef, uint32_t count) { |
| __m128i x; |
| __m128i c0, c2, c4, c6, c8, c10, c12; |
| __m128i c14, c16, c18, c20, c22, c24; |
| __m128i p0, p1, p2, p3, p4, p5, p6, p7; |
| __m128i p8, p9, p10, p11, p12, p13, p14, p15; |
| __m128i p16, p17, p18, p19, p20, p21, p22, p23; |
| __m128i p24, p25, p26, p27, p28, p29, p30, p31; |
| __m128i p32, p33, p34, p35, p36, p37, p38, p39; |
| __m128i o0, o1, o2, o3; |
| uint32_t i; |
| |
| x = _mm_loadl_epi64((const __m128i *)(coef+0)); |
| c0 = _mm_shuffle_epi32(x, 0x00); |
| c2 = _mm_shuffle_epi32(x, 0x55); |
| |
| x = _mm_loadl_epi64((const __m128i *)(coef+4)); |
| c4 = _mm_shuffle_epi32(x, 0x00); |
| c6 = _mm_shuffle_epi32(x, 0x55); |
| |
| x = _mm_loadl_epi64((const __m128i *)(coef+8)); |
| c8 = _mm_shuffle_epi32(x, 0x00); |
| c10 = _mm_shuffle_epi32(x, 0x55); |
| |
| x = _mm_loadl_epi64((const __m128i *)(coef+12)); |
| c12 = _mm_shuffle_epi32(x, 0x00); |
| c14 = _mm_shuffle_epi32(x, 0x55); |
| |
| x = _mm_loadl_epi64((const __m128i *)(coef+16)); |
| c16 = _mm_shuffle_epi32(x, 0x00); |
| c18 = _mm_shuffle_epi32(x, 0x55); |
| |
| x = _mm_loadl_epi64((const __m128i *)(coef+20)); |
| c20 = _mm_shuffle_epi32(x, 0x00); |
| c22 = _mm_shuffle_epi32(x, 0x55); |
| |
| x = _mm_loadl_epi64((const __m128i *)(coef+24)); |
| c24 = _mm_shuffle_epi32(x, 0x00); |
| |
| for (i = 0; i < count; ++i) { |
| |
| p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128()); |
| p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128()); |
| p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128()); |
| p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128()); |
| p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128()); |
| p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128()); |
| p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128()); |
| p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128()); |
| |
| p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128()); |
| p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128()); |
| p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128()); |
| p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128()); |
| p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128()); |
| p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128()); |
| p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128()); |
| p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128()); |
| |
| p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128()); |
| p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128()); |
| p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128()); |
| p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128()); |
| p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128()); |
| p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128()); |
| p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128()); |
| p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128()); |
| |
| p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128()); |
| p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128()); |
| p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128()); |
| p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128()); |
| p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128()); |
| p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128()); |
| p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128()); |
| p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128()); |
| |
| p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128()); |
| p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128()); |
| p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128()); |
| p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128()); |
| p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128()); |
| p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128()); |
| p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128()); |
| p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128()); |
| |
| o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10), c6)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22)); |
| o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24)); |
| o0 = _mm_srai_epi32(o0, 8); |
| |
| o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c2)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11), c6)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13), c8)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22)); |
| o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24)); |
| o1 = _mm_srai_epi32(o1, 8); |
| |
| o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3), c0); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5), c2)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22)); |
| o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24)); |
| o2 = _mm_srai_epi32(o2, 8); |
| |
| o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c0); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22)); |
| o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24)); |
| o3 = _mm_srai_epi32(o3, 8); |
| |
| o0 = packus_epi32(o0, o1); |
| o2 = packus_epi32(o2, o3); |
| o0 = _mm_packus_epi16(o0, o2); |
| _mm_storeu_si128((__m128i *)dst, o0); |
| |
| y0 = (const char *)y0 + 16; |
| y1 = (const char *)y1 + 16; |
| y2 = (const char *)y2 + 16; |
| y3 = (const char *)y3 + 16; |
| y4 = (const char *)y4 + 16; |
| dst = (char *)dst + 16; |
| } |
| } |
| |
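| /* The blend kernels below operate on premultiplied RGBA_8888 data and |
| * process two 16-byte vectors (eight pixels) per iteration, so 'count8' is |
| * presumably the pixel count divided by eight. Division by 255 is |
| * approximated with a shift right by 8 throughout. |
| * |
| * Source-over: dst = src + dst * (255 - src.a) >> 8. */ |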
| void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) { |
| __m128i all1s, ina, ins; |
| __m128i in0, in1, out0, out1; |
| __m128i t0, t1, t2, t3; |
| uint32_t i; |
| |
| all1s = _mm_set1_epi16(255); |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); |
| t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina)); |
| t0 = _mm_srli_epi16(t0, 8); |
| t0 = _mm_add_epi16(t0, ins); |
| |
| ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); |
| t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina)); |
| t1 = _mm_srli_epi16(t1, 8); |
| t1 = _mm_add_epi16(t1, ins); |
| |
| ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); |
| t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina)); |
| t2 = _mm_srli_epi16(t2, 8); |
| t2 = _mm_add_epi16(t2, ins); |
| |
| ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); |
| t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina)); |
| t3 = _mm_srli_epi16(t3, 8); |
| t3 = _mm_add_epi16(t3, ins); |
| |
| t0 = _mm_packus_epi16(t0, t1); |
| t2 = _mm_packus_epi16(t2, t3); |
| _mm_storeu_si128((__m128i *)dst, t0); |
| _mm_storeu_si128((__m128i *)dst + 1, t2); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
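| /* Destination-over: dst = dst + src * (255 - dst.a) >> 8. */ |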
| void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) { |
| __m128i all1s, outa, outs; |
| __m128i in0, in1, out0, out1; |
| __m128i t0, t1, t2, t3; |
| uint32_t i; |
| |
| all1s = _mm_set1_epi16(255); |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| |
| outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); |
| t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa)); |
| t0 = _mm_srli_epi16(t0, 8); |
| t0 = _mm_add_epi16(t0, outs); |
| |
| outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); |
| t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa)); |
| t1 = _mm_srli_epi16(t1, 8); |
| t1 = _mm_add_epi16(t1, outs); |
| |
| outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); |
| t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa)); |
| t2 = _mm_srli_epi16(t2, 8); |
| t2 = _mm_add_epi16(t2, outs); |
| |
| outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); |
| t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa)); |
| t3 = _mm_srli_epi16(t3, 8); |
| t3 = _mm_add_epi16(t3, outs); |
| |
| t0 = _mm_packus_epi16(t0, t1); |
| t2 = _mm_packus_epi16(t2, t3); |
| _mm_storeu_si128((__m128i *)dst, t0); |
| _mm_storeu_si128((__m128i *)dst + 1, t2); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
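| /* Source-in: dst = src * dst.a >> 8. */ |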
| void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) { |
| __m128i outa; |
| __m128i in0, in1, out0, out1; |
| __m128i t0, t1, t2, t3; |
| uint32_t i; |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outa, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); |
| t0 = _mm_mullo_epi16(t0, outa); |
| t0 = _mm_srli_epi16(t0, 8); |
| |
| outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outa, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); |
| t1 = _mm_mullo_epi16(t1, outa); |
| t1 = _mm_srli_epi16(t1, 8); |
| |
| outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outa, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); |
| t2 = _mm_mullo_epi16(t2, outa); |
| t2 = _mm_srli_epi16(t2, 8); |
| |
| outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outa, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); |
| t3 = _mm_mullo_epi16(t3, outa); |
| t3 = _mm_srli_epi16(t3, 8); |
| |
| t0 = _mm_packus_epi16(t0, t1); |
| t2 = _mm_packus_epi16(t2, t3); |
| _mm_storeu_si128((__m128i *)dst, t0); |
| _mm_storeu_si128((__m128i *)dst + 1, t2); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
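| /* Destination-in: dst = dst * src.a >> 8. */ |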
| void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) { |
| __m128i ina; |
| __m128i in0, in1, out0, out1; |
| __m128i t0, t1, t2, t3; |
| uint32_t i; |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ina, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); |
| t0 = _mm_mullo_epi16(t0, ina); |
| t0 = _mm_srli_epi16(t0, 8); |
| |
| ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ina, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); |
| t1 = _mm_mullo_epi16(t1, ina); |
| t1 = _mm_srli_epi16(t1, 8); |
| |
| ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ina, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); |
| t2 = _mm_mullo_epi16(t2, ina); |
| t2 = _mm_srli_epi16(t2, 8); |
| |
| ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ina, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); |
| t3 = _mm_mullo_epi16(t3, ina); |
| t3 = _mm_srli_epi16(t3, 8); |
| |
| t0 = _mm_packus_epi16(t0, t1); |
| t2 = _mm_packus_epi16(t2, t3); |
| _mm_storeu_si128((__m128i *)dst, t0); |
| _mm_storeu_si128((__m128i *)dst + 1, t2); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
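| /* Source-out: dst = src * (255 - dst.a) >> 8. */ |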
| void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) { |
| __m128i all1s, outa; |
| __m128i in0, in1, out0, out1; |
| __m128i t0, t1, t2, t3; |
| uint32_t i; |
| |
| all1s = _mm_set1_epi16(255); |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outa, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); |
| t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa)); |
| t0 = _mm_srli_epi16(t0, 8); |
| |
| outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outa, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); |
| t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa)); |
| t1 = _mm_srli_epi16(t1, 8); |
| |
| outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outa, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); |
| t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa)); |
| t2 = _mm_srli_epi16(t2, 8); |
| |
| outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outa, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); |
| t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa)); |
| t3 = _mm_srli_epi16(t3, 8); |
| |
| t0 = _mm_packus_epi16(t0, t1); |
| t2 = _mm_packus_epi16(t2, t3); |
| _mm_storeu_si128((__m128i *)dst, t0); |
| _mm_storeu_si128((__m128i *)dst + 1, t2); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
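| /* Destination-out: dst = dst * (255 - src.a) >> 8. */ |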
| void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) { |
| __m128i all1s, ina; |
| __m128i in0, in1, out0, out1; |
| __m128i t0, t1, t2, t3; |
| uint32_t i; |
| |
| all1s = _mm_set1_epi16(255); |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ina, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); |
| t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina)); |
| t0 = _mm_srli_epi16(t0, 8); |
| |
| ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ina, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); |
| t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina)); |
| t1 = _mm_srli_epi16(t1, 8); |
| |
| ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ina, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); |
| t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina)); |
| t2 = _mm_srli_epi16(t2, 8); |
| |
| ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ina, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); |
| t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina)); |
| t3 = _mm_srli_epi16(t3, 8); |
| |
| t0 = _mm_packus_epi16(t0, t1); |
| t2 = _mm_packus_epi16(t2, t3); |
| _mm_storeu_si128((__m128i *)dst, t0); |
| _mm_storeu_si128((__m128i *)dst + 1, t2); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
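| /* Source-atop: dst.rgb = (src.rgb * dst.a + dst.rgb * (255 - src.a)) >> 8, |
| * while the destination alpha byte is kept (selected via the M0001 mask). */ |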
| void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) { |
| const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000); |
| __m128i all1s, ina, outa, ins, outs; |
| __m128i in0, in1, out0, out1; |
| __m128i t0, t1, t2, t3; |
| uint32_t i; |
| |
| all1s = _mm_set1_epi16(255); |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t0 = _mm_sub_epi16(all1s, ina); |
| t0 = _mm_mullo_epi16(t0, outs); |
| t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins)); |
| t0 = _mm_srli_epi16(t0, 8); |
| |
| ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t1 = _mm_sub_epi16(all1s, ina); |
| t1 = _mm_mullo_epi16(t1, outs); |
| t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins)); |
| t1 = _mm_srli_epi16(t1, 8); |
| |
| ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t2 = _mm_sub_epi16(all1s, ina); |
| t2 = _mm_mullo_epi16(t2, outs); |
| t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins)); |
| t2 = _mm_srli_epi16(t2, 8); |
| |
| ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t3 = _mm_sub_epi16(all1s, ina); |
| t3 = _mm_mullo_epi16(t3, outs); |
| t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins)); |
| t3 = _mm_srli_epi16(t3, 8); |
| |
| t0 = _mm_packus_epi16(t0, t1); |
| t0 = blendv_epi8(t0, out0, M0001); |
| t2 = _mm_packus_epi16(t2, t3); |
| t2 = blendv_epi8(t2, out1, M0001); |
| _mm_storeu_si128((__m128i *)dst, t0); |
| _mm_storeu_si128((__m128i *)dst + 1, t2); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
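| /* Destination-atop: dst.rgb = (dst.rgb * src.a + src.rgb * (255 - dst.a)) >> 8, |
| * while the alpha byte is taken from the source (selected via M0001). */ |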
| void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) { |
| const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000); |
| __m128i all1s, ina, ins, outa, outs; |
| __m128i in0, in1, out0, out1; |
| __m128i t0, t1, t2, t3; |
| uint32_t i; |
| |
| all1s = _mm_set1_epi16(255); |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t0 = _mm_sub_epi16(all1s, outa); |
| t0 = _mm_mullo_epi16(t0, ins); |
| t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs)); |
| t0 = _mm_srli_epi16(t0, 8); |
| |
| ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t1 = _mm_sub_epi16(all1s, outa); |
| t1 = _mm_mullo_epi16(t1, ins); |
| t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs)); |
| t1 = _mm_srli_epi16(t1, 8); |
| |
| ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t2 = _mm_sub_epi16(all1s, outa); |
| t2 = _mm_mullo_epi16(t2, ins); |
| t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs)); |
| t2 = _mm_srli_epi16(t2, 8); |
| |
| ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); |
| ina = _mm_shufflelo_epi16(ins, 0xFF); |
| ina = _mm_shufflehi_epi16(ina, 0xFF); |
| outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); |
| outa = _mm_shufflelo_epi16(outs, 0xFF); |
| outa = _mm_shufflehi_epi16(outa, 0xFF); |
| t3 = _mm_sub_epi16(all1s, outa); |
| t3 = _mm_mullo_epi16(t3, ins); |
| t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs)); |
| t3 = _mm_srli_epi16(t3, 8); |
| |
| t0 = _mm_packus_epi16(t0, t1); |
| t0 = blendv_epi8(t0, in0, M0001); |
| t2 = _mm_packus_epi16(t2, t3); |
| t2 = blendv_epi8(t2, in1, M0001); |
| _mm_storeu_si128((__m128i *)dst, t0); |
| _mm_storeu_si128((__m128i *)dst + 1, t2); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
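| /* XOR blend: a bitwise XOR of the source and destination pixel data (not the |
| * Porter-Duff 'xor' operator). */ |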
| void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) { |
| __m128i in0, in1, out0, out1; |
| uint32_t i; |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| out0 = _mm_xor_si128(out0, in0); |
| out1 = _mm_xor_si128(out1, in1); |
| |
| _mm_storeu_si128((__m128i *)dst, out0); |
| _mm_storeu_si128((__m128i *)dst + 1, out1); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
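| /* Multiply blend: per-channel product, dst = dst * src >> 8. */ |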
| void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) { |
| __m128i in0, in1, out0, out1; |
| __m128i t0, t1, t2, t3; |
| uint32_t i; |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); |
| t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128())); |
| t0 = _mm_srli_epi16(t0, 8); |
| |
| t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); |
| t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128())); |
| t1 = _mm_srli_epi16(t1, 8); |
| |
| t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); |
| t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128())); |
| t2 = _mm_srli_epi16(t2, 8); |
| |
| t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); |
| t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128())); |
| t3 = _mm_srli_epi16(t3, 8); |
| |
| t0 = _mm_packus_epi16(t0, t1); |
| t2 = _mm_packus_epi16(t2, t3); |
| _mm_storeu_si128((__m128i *)dst, t0); |
| _mm_storeu_si128((__m128i *)dst + 1, t2); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
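| /* Additive blend: per-channel saturating add, dst = saturate(dst + src). */ |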
| void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) { |
| __m128i in0, in1, out0, out1; |
| uint32_t i; |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| out0 = _mm_adds_epu8(out0, in0); |
| out1 = _mm_adds_epu8(out1, in1); |
| |
| _mm_storeu_si128((__m128i *)dst, out0); |
| _mm_storeu_si128((__m128i *)dst + 1, out1); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |
| |
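| /* Subtractive blend: per-channel saturating subtract, dst = saturate(dst - src). */ |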
| void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) { |
| __m128i in0, in1, out0, out1; |
| uint32_t i; |
| |
| for (i = 0; i < count8; ++i) { |
| in0 = _mm_loadu_si128((const __m128i *)src); |
| in1 = _mm_loadu_si128((const __m128i *)src + 1); |
| out0 = _mm_loadu_si128((const __m128i *)dst); |
| out1 = _mm_loadu_si128((const __m128i *)dst + 1); |
| |
| out0 = _mm_subs_epu8(out0, in0); |
| out1 = _mm_subs_epu8(out1, in1); |
| |
| _mm_storeu_si128((__m128i *)dst, out0); |
| _mm_storeu_si128((__m128i *)dst + 1, out1); |
| |
| src = (const __m128i *)src + 2; |
| dst = (__m128i *)dst + 2; |
| } |
| } |