Vikas Arora | a241572 | 2012-08-09 16:18:58 -0700 | [diff] [blame] | 1 | // Copyright 2010 Google Inc. All Rights Reserved. |
Eric Hassold | 9aea642 | 2011-01-04 17:22:46 -0800 | [diff] [blame] | 2 | // |
Vikas Arora | 0406ce1 | 2013-08-09 15:57:12 -0700 | [diff] [blame] | 3 | // Use of this source code is governed by a BSD-style license |
| 4 | // that can be found in the COPYING file in the root of the source |
| 5 | // tree. An additional intellectual property rights grant can be found |
| 6 | // in the file PATENTS. All contributing project authors may |
| 7 | // be found in the AUTHORS file in the root of the source tree. |
Eric Hassold | 9aea642 | 2011-01-04 17:22:46 -0800 | [diff] [blame] | 8 | // ----------------------------------------------------------------------------- |
| 9 | // |
| 10 | // YUV->RGB conversion function |
| 11 | // |
| 12 | // Author: Skal (pascal.massimino@gmail.com) |
| 13 | |
Vikas Arora | a241572 | 2012-08-09 16:18:58 -0700 | [diff] [blame] | 14 | #include "./yuv.h" |
Eric Hassold | 9aea642 | 2011-01-04 17:22:46 -0800 | [diff] [blame] | 15 | |
Eric Hassold | 9aea642 | 2011-01-04 17:22:46 -0800 | [diff] [blame] | 16 | |
Vikas Arora | 8b72022 | 2014-01-02 16:48:02 -0800 | [diff] [blame] | 17 | #if defined(WEBP_YUV_USE_TABLE) |
Eric Hassold | 9aea642 | 2011-01-04 17:22:46 -0800 | [diff] [blame] | 18 | |
// Becomes non-zero once VP8YUVInit() has filled the lookup tables, so that
// subsequent calls can return without redoing the work.
static int done = 0;
| 20 | |
// Clamps 'v' to the closed interval [0, max_value] and returns it as a byte.
static WEBP_INLINE uint8_t clip(int v, int max_value) {
  if (v < 0) {
    return 0;
  }
  if (v > max_value) {
    return (uint8_t)max_value;
  }
  return (uint8_t)v;
}
| 24 | |
Vikas Arora | 8b72022 | 2014-01-02 16:48:02 -0800 | [diff] [blame] | 25 | int16_t VP8kVToR[256], VP8kUToB[256]; |
| 26 | int32_t VP8kVToG[256], VP8kUToG[256]; |
| 27 | uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN]; |
| 28 | uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN]; |
| 29 | |
Vikas Arora | 03d5e34 | 2011-06-02 23:59:44 +0530 | [diff] [blame] | 30 | void VP8YUVInit(void) { |
Eric Hassold | 9aea642 | 2011-01-04 17:22:46 -0800 | [diff] [blame] | 31 | int i; |
| 32 | if (done) { |
| 33 | return; |
| 34 | } |
Vikas Arora | 1e7bf88 | 2013-03-13 16:43:18 -0700 | [diff] [blame] | 35 | #ifndef USE_YUVj |
Eric Hassold | 9aea642 | 2011-01-04 17:22:46 -0800 | [diff] [blame] | 36 | for (i = 0; i < 256; ++i) { |
| 37 | VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX; |
| 38 | VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF; |
| 39 | VP8kVToG[i] = -45773 * (i - 128); |
| 40 | VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX; |
| 41 | } |
| 42 | for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) { |
| 43 | const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX; |
Vikas Arora | 4667279 | 2011-07-13 16:37:55 +0530 | [diff] [blame] | 44 | VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255); |
| 45 | VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15); |
Eric Hassold | 9aea642 | 2011-01-04 17:22:46 -0800 | [diff] [blame] | 46 | } |
Vikas Arora | 1e7bf88 | 2013-03-13 16:43:18 -0700 | [diff] [blame] | 47 | #else |
| 48 | for (i = 0; i < 256; ++i) { |
| 49 | VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX; |
| 50 | VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF; |
| 51 | VP8kVToG[i] = -46802 * (i - 128); |
| 52 | VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX; |
| 53 | } |
| 54 | for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) { |
| 55 | const int k = i; |
| 56 | VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255); |
| 57 | VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15); |
| 58 | } |
| 59 | #endif |
| 60 | |
Eric Hassold | 9aea642 | 2011-01-04 17:22:46 -0800 | [diff] [blame] | 61 | done = 1; |
| 62 | } |
| 63 | |
Vikas Arora | 0406ce1 | 2013-08-09 15:57:12 -0700 | [diff] [blame] | 64 | #else |
| 65 | |
// Without WEBP_YUV_USE_TABLE there are no lookup tables to fill, so the
// initializer is an empty function.
void VP8YUVInit(void) {
}
| 67 | |
| 68 | #endif // WEBP_YUV_USE_TABLE |
| 69 | |
Vikas Arora | 8b72022 | 2014-01-02 16:48:02 -0800 | [diff] [blame] | 70 | //----------------------------------------------------------------------------- |
| 71 | // SSE2 extras |
| 72 | |
| 73 | #if defined(WEBP_USE_SSE2) |
| 74 | |
| 75 | #ifdef FANCY_UPSAMPLING |
| 76 | |
| 77 | #include <emmintrin.h> |
| 78 | #include <string.h> // for memcpy |
| 79 | |
// Handy union for converting SSE2 registers: the same 16 bytes can be
// accessed as four 32-bit integers, as sixteen bytes, or as one __m128i.
typedef union {
  int32_t i32[4];   // four 32-bit lanes
  uint8_t u8[16];   // raw bytes
  __m128i m;        // whole register
} VP8kCstSSE2;
| 85 | |
| 86 | static int done_sse2 = 0; |
| 87 | static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256]; |
| 88 | |
| 89 | void VP8YUVInitSSE2(void) { |
| 90 | if (!done_sse2) { |
| 91 | int i; |
| 92 | for (i = 0; i < 256; ++i) { |
| 93 | VP8kYtoRGBA[i].i32[0] = |
| 94 | VP8kYtoRGBA[i].i32[1] = |
| 95 | VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2; |
| 96 | VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2; |
| 97 | |
| 98 | VP8kUtoRGBA[i].i32[0] = 0; |
| 99 | VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128); |
| 100 | VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128); |
| 101 | VP8kUtoRGBA[i].i32[3] = 0; |
| 102 | |
| 103 | VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128); |
| 104 | VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128); |
| 105 | VP8kVtoRGBA[i].i32[2] = 0; |
| 106 | VP8kVtoRGBA[i].i32[3] = 0; |
| 107 | } |
| 108 | done_sse2 = 1; |
| 109 | } |
| 110 | } |
| 111 | |
| 112 | static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) { |
| 113 | const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m); |
| 114 | const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m); |
| 115 | const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m); |
| 116 | const __m128i uv_part = _mm_add_epi32(u_part, v_part); |
| 117 | const __m128i rgba1 = _mm_add_epi32(y_part, uv_part); |
| 118 | const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2); |
| 119 | return rgba2; |
| 120 | } |
| 121 | |
| 122 | static WEBP_INLINE void VP8YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v, |
| 123 | uint8_t* const rgb) { |
| 124 | const __m128i tmp0 = VP8GetRGBA32b(y, u, v); |
| 125 | const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0); |
| 126 | const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1); |
| 127 | // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp |
| 128 | _mm_storel_epi64((__m128i*)rgb, tmp2); |
| 129 | } |
| 130 | |
| 131 | static WEBP_INLINE void VP8YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v, |
| 132 | uint8_t* const bgr) { |
| 133 | const __m128i tmp0 = VP8GetRGBA32b(y, u, v); |
| 134 | const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2)); |
| 135 | const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1); |
| 136 | const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2); |
| 137 | // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp |
| 138 | _mm_storel_epi64((__m128i*)bgr, tmp3); |
| 139 | } |
| 140 | |
| 141 | void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 142 | uint8_t* dst) { |
| 143 | int n; |
| 144 | for (n = 0; n < 32; n += 4) { |
| 145 | const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); |
| 146 | const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); |
| 147 | const __m128i tmp0_3 = VP8GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]); |
| 148 | const __m128i tmp0_4 = VP8GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]); |
| 149 | const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); |
| 150 | const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); |
| 151 | const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); |
| 152 | _mm_storeu_si128((__m128i*)dst, tmp2); |
| 153 | dst += 4 * 4; |
| 154 | } |
| 155 | } |
| 156 | |
| 157 | void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 158 | uint8_t* dst) { |
| 159 | int n; |
| 160 | for (n = 0; n < 32; n += 2) { |
| 161 | const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); |
| 162 | const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); |
| 163 | const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); |
| 164 | const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); |
| 165 | const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); |
| 166 | const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); |
| 167 | _mm_storel_epi64((__m128i*)dst, tmp3); |
| 168 | dst += 4 * 2; |
| 169 | } |
| 170 | } |
| 171 | |
// Converts 32 pixels to packed 24-bit RGB. Reads y[0..31], u[0..31] and
// v[0..31] and writes exactly 32 * 3 = 96 bytes to 'dst'.
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  enum { kNumDirect = 30, kNumTail = 2, kBytesPerPixel = 3 };
  // Scratch area for the tail pixels: 2 * 3 payload bytes, 5 bytes of slack
  // for the 8-byte store of the last pixel, and 15 bytes for 16-byte alignment.
  uint8_t scratch_storage[2 * 3 + 5 + 15];
  uint8_t* const scratch =
      (uint8_t*)((uintptr_t)(scratch_storage + 15) & ~15);
  uint8_t* out = dst;
  int n;

  // Each conversion stores 8 bytes for a 3-byte pixel. For the first 30
  // pixels the 5 surplus bytes stay within the 96-byte output and are
  // overwritten by the following pixels, so we write straight into 'dst'.
  for (n = 0; n < kNumDirect; ++n) {
    VP8YuvToRgbSSE2(y[n], u[n], v[n], out);
    out += kBytesPerPixel;
  }

  // The last two pixels would spill past the end of the output, so they are
  // converted into the scratch area and only their 6 useful bytes are copied.
  VP8YuvToRgbSSE2(y[kNumDirect + 0], u[kNumDirect + 0], v[kNumDirect + 0],
                  scratch + 0 * kBytesPerPixel);
  VP8YuvToRgbSSE2(y[kNumDirect + 1], u[kNumDirect + 1], v[kNumDirect + 1],
                  scratch + 1 * kBytesPerPixel);
  memcpy(out, scratch, kNumTail * kBytesPerPixel);
}
| 186 | |
// Converts 32 pixels to packed 24-bit BGR. Reads y[0..31], u[0..31] and
// v[0..31] and writes exactly 32 * 3 = 96 bytes to 'dst'.
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst) {
  enum { kNumDirect = 30, kNumTail = 2, kBytesPerPixel = 3 };
  // Scratch area for the tail pixels: 2 * 3 payload bytes, 5 bytes of slack
  // for the 8-byte store of the last pixel, and 15 bytes for 16-byte alignment.
  uint8_t scratch_storage[2 * 3 + 5 + 15];
  uint8_t* const scratch =
      (uint8_t*)((uintptr_t)(scratch_storage + 15) & ~15);
  uint8_t* out = dst;
  int n;

  // Each conversion stores 8 bytes for a 3-byte pixel. For the first 30
  // pixels the 5 surplus bytes stay within the 96-byte output and are
  // overwritten by the following pixels, so we write straight into 'dst'.
  for (n = 0; n < kNumDirect; ++n) {
    VP8YuvToBgrSSE2(y[n], u[n], v[n], out);
    out += kBytesPerPixel;
  }

  // The last two pixels would spill past the end of the output, so they are
  // converted into the scratch area and only their 6 useful bytes are copied.
  VP8YuvToBgrSSE2(y[kNumDirect + 0], u[kNumDirect + 0], v[kNumDirect + 0],
                  scratch + 0 * kBytesPerPixel);
  VP8YuvToBgrSSE2(y[kNumDirect + 1], u[kNumDirect + 1], v[kNumDirect + 1],
                  scratch + 1 * kBytesPerPixel);
  memcpy(out, scratch, kNumTail * kBytesPerPixel);
}
| 199 | |
| 200 | #else |
| 201 | |
// Without FANCY_UPSAMPLING the SSE2 tables are not compiled in, so the
// initializer is an empty function.
void VP8YUVInitSSE2(void) {
}
| 203 | |
| 204 | #endif // FANCY_UPSAMPLING |
| 205 | |
| 206 | #endif // WEBP_USE_SSE2 |
| 207 | |